author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
commit     e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree       64f88b554b444a49f656b6c656111a145cbbaa28 /src/crypto
parent     Initial commit. (diff)
Adding upstream version 18.2.2. (tag: upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/crypto')
-rw-r--r--src/crypto/CMakeLists.txt12
-rw-r--r--src/crypto/crypto_accel.h37
-rw-r--r--src/crypto/crypto_plugin.h36
-rw-r--r--src/crypto/isa-l/CMakeLists.txt36
-rw-r--r--src/crypto/isa-l/isa-l_crypto/.gitignore27
-rw-r--r--src/crypto/isa-l/isa-l_crypto/CONTRIBUTING.md39
-rw-r--r--src/crypto/isa-l/isa-l_crypto/Doxyfile31
-rw-r--r--src/crypto/isa-l/isa-l_crypto/LICENSE26
-rw-r--r--src/crypto/isa-l/isa-l_crypto/Makefile.am161
-rw-r--r--src/crypto/isa-l/isa-l_crypto/Makefile.nmake493
-rw-r--r--src/crypto/isa-l/isa-l_crypto/Makefile.unx50
-rw-r--r--src/crypto/isa-l/isa-l_crypto/README.md63
-rw-r--r--src/crypto/isa-l/isa-l_crypto/Release_notes.txt215
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/Makefile.am170
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm1778
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_avx.asm1748
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_sse.asm1747
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_vaes.asm1648
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm1779
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_vaes.asm1681
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm1531
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_avx.asm1506
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_sse.asm1505
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_vaes.asm1473
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm1530
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_vaes.asm1498
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm1962
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_avx.asm1896
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm1898
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_vaes.asm1808
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm1963
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_vaes.asm1875
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm1708
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_avx.asm1653
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_sse.asm1652
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_vaes.asm1634
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm1708
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_vaes.asm1687
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_128.S215
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_256.S220
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_init.S161
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_consts.S140
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_128.S30
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_256.S30
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_128.S30
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_256.S30
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_128.S32
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_256.S32
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_aarch64_dispatcher.c108
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_common.S54
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_dec_aes.S482
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_enc_aes.S157
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_multibinary_aarch64.S38
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_aarch64_dispatcher.c255
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common.S430
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_128.S165
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_256.S181
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_enc_dec.S588
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_multibinary_aarch64.S58
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_precomp.S83
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_update.S277
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_128_aarch64_aes.S134
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_192_aarch64_aes.S136
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_256_aarch64_aes.S153
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_aarch64_dispatcher.c72
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_multibinary_aarch64.S35
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aarch64_dispatcher.c102
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_common.S214
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_dec.S116
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_enc.S91
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_common.S247
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_dec.S116
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_enc.S88
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_common.S232
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_dec.S49
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_enc.S49
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_dec.S49
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_enc.S49
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_multibinary_aarch64.S39
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aes_common.asm377
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm431
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm162
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm162
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm164
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm158
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm161
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm158
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_vaes_avx512.asm519
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm137
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm151
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm149
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm147
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm141
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm148
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_multibinary.asm102
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c339
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c56
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors.h466
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_random_test.c443
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c183
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/clear_regs.asm202
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm31
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2_nt.asm33
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm31
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4_nt.asm33
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm31
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse_nt.asm33
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512.asm32
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512_nt.asm33
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm31
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2_nt.asm33
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm31
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4_nt.asm33
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm31
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse_nt.asm33
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512.asm32
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512_nt.asm33
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen2.asm2130
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen4.asm3277
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm291
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_keys_vaes_avx512.asm233
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary.asm184
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary_nt.asm118
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_rand_test.c2038
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_std_vectors_test.c322
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c272
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c61
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_simple_example.c78
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_sse.asm2171
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_random_test.c1940
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c659
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_vaes_avx512.asm4296
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h476
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm328
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm274
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm286
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/keyexp_multibinary.asm68
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h302
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c143
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_perf.c125
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c144
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_perf.c123
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_expanded_key_test.c116
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c247
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand_ossl_test.c271
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c106
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h1691
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c145
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_perf.c126
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c145
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_perf.c124
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_expanded_key_test.c113
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c249
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand_ossl_test.c273
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c105
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h1035
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_aes_128_multibinary.asm78
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_aes_256_multibinary.asm78
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/autogen.sh17
-rw-r--r--src/crypto/isa-l/isa-l_crypto/configure.ac349
-rw-r--r--src/crypto/isa-l/isa-l_crypto/examples/saturation_test/Makefile27
-rw-r--r--src/crypto/isa-l/isa-l_crypto/examples/saturation_test/README.txt25
-rw-r--r--src/crypto/isa-l/isa-l_crypto/examples/saturation_test/aes_thread.c380
-rw-r--r--src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.c206
-rw-r--r--src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.h52
-rw-r--r--src/crypto/isa-l/isa-l_crypto/examples/saturation_test/md5_thread.c213
-rw-r--r--src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha1_thread.c20
-rw-r--r--src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha256_thread.c20
-rw-r--r--src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha512_thread.c20
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/aarch64_multibinary.h301
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/aes_cbc.h165
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/aes_gcm.h613
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/aes_keyexp.h76
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/aes_xts.h214
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/datastruct.asm79
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/endian_helper.h83
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/intrinreg.h65
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/md5_mb.h372
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/memcpy.asm615
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/memcpy_inline.h375
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/mh_sha1.h315
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/mh_sha1_murmur3_x64_128.h327
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/mh_sha256.h315
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/multi_buffer.h112
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/multibinary.asm517
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/reg_sizes.asm442
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/rolling_hashx.h114
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/sha1_mb.h450
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/sha256_mb.h451
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/sha512_mb.h422
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/sm3_mb.h155
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/test.h111
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/types.h100
-rw-r--r--src/crypto/isa-l/isa-l_crypto/isa-l_crypto.def80
-rw-r--r--src/crypto/isa-l/isa-l_crypto/libisal_crypto.pc.in11
-rw-r--r--src/crypto/isa-l/isa-l_crypto/make.inc340
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am98
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_ctx_aarch64_asimd.c230
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_aarch64_dispatcher.c59
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x1.S248
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x4.S526
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_mgr_aarch64_asimd.c187
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_multibinary.S36
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c263
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c263
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c267
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base.c291
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base_aliases.c50
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c249
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm55
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm73
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm248
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm255
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx512.asm315
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm249
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c41
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c44
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c41
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm228
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm239
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm283
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm229
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c159
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c202
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_update_test.c297
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c229
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c129
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm853
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm783
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm779
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm920
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm80
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c186
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/Makefile.am83
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_aarch64_dispatcher.c55
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_asimd.c53
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_asimd.S124
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_ce.S384
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_ce.c53
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_multibinary.S35
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/sha1_asimd_common.S269
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1.c141
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_avx512.c70
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_base_aliases.c40
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx.asm506
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx2.asm508
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx512.asm406
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_base.c387
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_sse.asm498
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_finalize_base.c122
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_internal.h308
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_multibinary.asm77
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_perf.c180
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_ref.c430
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_test.c217
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_base.c110
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_test.c240
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/sha1_for_mh_sha1.c204
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/Makefile.am89
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_dispatcher.c53
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_internal.h91
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_asimd.c54
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_asimd.S224
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_ce.S482
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_ce.c54
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_multibinary.S34
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/sha1_asimd_common.S271
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c154
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_avx512.c67
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_base_aliases.c43
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx.asm706
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx2.asm653
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx512.asm504
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_sse.asm702
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c102
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_internal.h202
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_multibinary.asm76
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_perf.c206
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_test.c248
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c107
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_test.c272
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128.c85
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128_internal.c138
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/Makefile.am88
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c49
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_block_ce.S731
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_ce.c53
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_multibinary.S35
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256.c143
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_avx512.c70
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_base_aliases.c40
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx.asm557
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx2.asm616
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx512.asm682
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_base.c188
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_sse.asm557
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_finalize_base.c121
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_internal.h318
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_multibinary.asm77
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_perf.c180
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_ref.c410
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_test.c217
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_base.c110
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_test.c240
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/sha256_for_mh_sha256.c176
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/Makefile.am57
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_dispatcher.c37
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_multibinary.S35
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_run_until_unroll.S115
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/chunking_with_mb_hash.c222
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2.c169
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_base_aliases.c39
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_multibinary.asm122
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_perf.c120
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_table.h296
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_test.c314
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_00.asm204
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_04.asm203
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hashx_base.c65
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am130
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_aarch64_x1.S294
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_asimd_common.S269
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_asimd.c250
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_ce.c250
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c93
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_asimd_x4.S192
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_asimd.c217
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_ce.c208
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_multibinary.S36
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x1_ce.S194
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x2_ce.S253
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c265
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c264
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c271
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512_ni.c281
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base.c325
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base_aliases.c54
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c251
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse_ni.c259
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm67
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_flush_test.c146
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_datastruct.asm74
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm247
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx2.asm273
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512.asm271
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm278
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm249
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse_ni.asm256
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c41
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c41
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c41
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx.asm246
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx2.asm250
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx512.asm248
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse.asm246
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse_ni.asm290
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c159
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c202
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_update_test.c297
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c233
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c128
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_shortage_perf.c132
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm563
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm416
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm413
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm518
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multi_buffer_example.c112
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multibinary.asm131
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm318
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm484
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm485
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c220
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am127
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_ctx_ce.c256
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c59
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_mgr_ce.c254
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_multibinary.S36
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x1_ce.S238
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x2_ce.S289
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x3_ce.S342
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x4_ce.S380
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c268
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c268
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c273
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512_ni.c283
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base.c301
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base_aliases.c54
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c256
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse_ni.c262
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm65
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_flush_test.c146
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm74
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm253
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm274
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm288
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm295
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm254
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse_ni.asm261
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c41
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c41
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c41
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm260
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm246
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm261
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm261
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse_ni.asm301
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c160
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c203
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c300
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c241
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c129
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_shortage_perf.c132
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm930
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm431
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm426
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm620
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm125
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x1.asm361
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm574
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_opt_x1.asm567
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c204
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am108
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_ctx_ce.c256
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_aarch64_dispatcher.c59
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_mgr_ce.c210
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_multibinary.S36
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x1_ce.S269
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x2_ce.S390
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c269
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c269
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c274
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base.c323
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base_aliases.c54
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c255
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c255
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm54
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm72
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm224
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm245
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm270
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm227
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c45
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c42
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c43
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm262
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm270
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm280
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm260
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c160
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c203
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c300
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c270
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c129
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm442
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm424
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm487
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm644
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm252
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c234
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c46
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c38
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c65
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm396
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/Makefile.am121
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c65
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x1.S387
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S576
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c246
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c241
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c188
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c250
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S36
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x1.S237
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x2.S344
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x3.S368
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x4.S440
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx2.c284
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx512.c292
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base.c314
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base_aliases.c54
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_job.asm65
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_flush_test.c145
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_datastruct.asm77
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx2.asm258
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx512.asm276
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx2.asm247
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx512.asm273
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_ssl_test.c160
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_test.c206
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_update_test.c298
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_test.c250
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_perf.c128
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_shortage_perf.c133
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x16_avx512.asm1035
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x8_avx2.asm711
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_multibinary.asm81
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ref_test.c207
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_test_helper.c45
-rw-r--r--src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile19
-rw-r--r--src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile.nmake58
-rw-r--r--src/crypto/isa-l/isa-l_crypto/tests/extended/md5_mb_over_4GB_test.c155
-rw-r--r--src/crypto/isa-l/isa-l_crypto/tests/extended/sha1_mb_over_4GB_test.c156
-rw-r--r--src/crypto/isa-l/isa-l_crypto/tests/extended/sha256_mb_over_4GB_test.c156
-rw-r--r--src/crypto/isa-l/isa-l_crypto/tests/extended/sha512_mb_over_4GB_test.c156
-rw-r--r--src/crypto/isa-l/isa-l_crypto/tests/extended/sm3_mb_over_4GB_test.c162
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/tools/check_format.sh87
-rw-r--r--src/crypto/isa-l/isa-l_crypto/tools/gen_nmake.mk123
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/tools/iindent2
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/tools/nasm-cet-filter.sh56
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/tools/nasm-filter.sh47
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/tools/remove_trailing_whitespace.sh2
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/tools/test_autorun.sh63
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/tools/test_checks.sh73
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/tools/test_extended.sh127
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/tools/test_tools.sh11
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/tools/yasm-cet-filter.sh47
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/tools/yasm-filter.sh38
-rw-r--r--src/crypto/isa-l/isal_crypto_accel.cc43
-rw-r--r--src/crypto/isa-l/isal_crypto_accel.h31
-rw-r--r--src/crypto/isa-l/isal_crypto_plugin.cc34
-rw-r--r--src/crypto/isa-l/isal_crypto_plugin.h47
-rw-r--r--src/crypto/openssl/CMakeLists.txt14
-rw-r--r--src/crypto/openssl/openssl_crypto_accel.cc104
-rw-r--r--src/crypto/openssl/openssl_crypto_accel.h32
-rw-r--r--src/crypto/openssl/openssl_crypto_plugin.cc32
-rw-r--r--src/crypto/openssl/openssl_crypto_plugin.h36
-rw-r--r--src/crypto/qat/CMakeLists.txt20
-rw-r--r--src/crypto/qat/qat_crypto_accel.cc42
-rw-r--r--src/crypto/qat/qat_crypto_accel.h35
-rw-r--r--src/crypto/qat/qat_crypto_plugin.cc35
-rw-r--r--src/crypto/qat/qat_crypto_plugin.h42
-rw-r--r--src/crypto/qat/qcccrypto.cc471
-rw-r--r--src/crypto/qat/qcccrypto.h176
533 files changed, 162071 insertions, 0 deletions
diff --git a/src/crypto/CMakeLists.txt b/src/crypto/CMakeLists.txt
new file mode 100644
index 000000000..f930886d3
--- /dev/null
+++ b/src/crypto/CMakeLists.txt
@@ -0,0 +1,12 @@
+add_custom_target(crypto_plugins)
+set(crypto_plugin_dir ${CEPH_INSTALL_PKGLIBDIR}/crypto)
+
+add_subdirectory(openssl)
+
+if(HAVE_INTEL AND HAVE_NASM_X64_AVX2 AND (NOT APPLE))
+ add_subdirectory(isa-l)
+endif()
+
+if(WITH_QAT)
+ add_subdirectory(qat)
+endif()
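
The top-level src/crypto/CMakeLists.txt above only selects which plugin backends get built: OpenSSL unconditionally, ISA-L only on Intel targets with an AVX2-capable NASM (and not on macOS), and QAT only when requested. As a hedged illustration, assuming the usual out-of-tree Ceph CMake workflow and that WITH_QAT is exposed as a regular user-settable option, enabling the QAT plugin would look roughly like:

    # from an out-of-tree build directory; the QAT backend is opt-in
    cmake -DWITH_QAT=ON ..
    # crypto_plugins is the custom target the plugin libraries attach to
    cmake --build . --target crypto_plugins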
diff --git a/src/crypto/crypto_accel.h b/src/crypto/crypto_accel.h
new file mode 100644
index 000000000..5c1593609
--- /dev/null
+++ b/src/crypto/crypto_accel.h
@@ -0,0 +1,37 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Mirantis, Inc.
+ *
+ * Author: Adam Kupczyk <akupczyk@mirantis.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef CRYPTO_ACCEL_H
+#define CRYPTO_ACCEL_H
+#include <cstddef>
+#include "include/Context.h"
+
+class CryptoAccel;
+typedef std::shared_ptr<CryptoAccel> CryptoAccelRef;
+
+class CryptoAccel {
+ public:
+ CryptoAccel() {}
+ virtual ~CryptoAccel() {}
+
+ static const int AES_256_IVSIZE = 128/8;
+ static const int AES_256_KEYSIZE = 256/8;
+ virtual bool cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) = 0;
+ virtual bool cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) = 0;
+};
+#endif
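
crypto_accel.h defines the abstract interface every acceleration backend implements: one-shot AES-256-CBC encrypt and decrypt over a contiguous buffer, with the 16-byte IV and 32-byte key passed as fixed-size array references. Below is a minimal sketch of a backend using a hypothetical NullCryptoAccel class that is not part of this commit; a real backend (such as the OpenSSL one later in this series) calls into its crypto library instead of copying bytes.

    // Hypothetical sketch only -- illustrates the shape of a CryptoAccel backend.
    #include <cstring>
    #include "crypto/crypto_accel.h"

    class NullCryptoAccel : public CryptoAccel {
     public:
      bool cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
                       const unsigned char (&iv)[AES_256_IVSIZE],
                       const unsigned char (&key)[AES_256_KEYSIZE]) override {
        if (size % AES_256_IVSIZE != 0)
          return false;               // CBC operates on whole 16-byte blocks
        std::memcpy(out, in, size);   // placeholder for the real cipher call
        return true;
      }
      bool cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
                       const unsigned char (&iv)[AES_256_IVSIZE],
                       const unsigned char (&key)[AES_256_KEYSIZE]) override {
        if (size % AES_256_IVSIZE != 0)
          return false;
        std::memcpy(out, in, size);   // placeholder for the real cipher call
        return true;
      }
    };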
diff --git a/src/crypto/crypto_plugin.h b/src/crypto/crypto_plugin.h
new file mode 100644
index 000000000..cf22d5cb4
--- /dev/null
+++ b/src/crypto/crypto_plugin.h
@@ -0,0 +1,36 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Mirantis, Inc.
+ *
+ * Author: Adam Kupczyk <akupczyk@mirantis.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef CRYPTO_PLUGIN_H
+#define CRYPTO_PLUGIN_H
+
+// -----------------------------------------------------------------------------
+#include "common/PluginRegistry.h"
+#include "ostream"
+
+#include "crypto/crypto_accel.h"
+// -----------------------------------------------------------------------------
+
+class CryptoPlugin : public ceph::Plugin {
+
+public:
+ CryptoAccelRef cryptoaccel;
+ explicit CryptoPlugin(CephContext* cct) : Plugin(cct)
+ {}
+ ~CryptoPlugin()
+ {}
+ virtual int factory(CryptoAccelRef *cs,
+ std::ostream *ss) = 0;
+};
+#endif
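
crypto_plugin.h hooks an accelerator into Ceph's generic PluginRegistry: a concrete plugin caches a CryptoAccelRef and implements factory(), which hands the shared reference back to the caller. The real implementations in this series are isal_crypto_plugin.h, openssl_crypto_plugin.h and qat_crypto_plugin.h; the following is a minimal sketch with hypothetical names, reusing the NullCryptoAccel class from the previous example and assuming the same lazy-construction pattern.

    // Hypothetical sketch only -- not part of the commit.
    #include "crypto/crypto_plugin.h"

    class NullCryptoPlugin : public CryptoPlugin {
     public:
      explicit NullCryptoPlugin(CephContext* cct) : CryptoPlugin(cct) {}

      int factory(CryptoAccelRef *cs, std::ostream *ss) override {
        if (!cryptoaccel)
          cryptoaccel = CryptoAccelRef(new NullCryptoAccel);  // construct once, then share
        *cs = cryptoaccel;
        return 0;
      }
    };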
diff --git a/src/crypto/isa-l/CMakeLists.txt b/src/crypto/isa-l/CMakeLists.txt
new file mode 100644
index 000000000..2a2ec0bc0
--- /dev/null
+++ b/src/crypto/isa-l/CMakeLists.txt
@@ -0,0 +1,36 @@
+set(isal_dir ${CMAKE_SOURCE_DIR}/src/crypto/isa-l/isa-l_crypto)
+set(CMAKE_ASM_FLAGS "-i ${isal_dir}/aes/ -i ${isal_dir}/include/ ${CMAKE_ASM_FLAGS}")
+
+set(isal_crypto_plugin_srcs
+ isal_crypto_accel.cc
+ isal_crypto_plugin.cc
+ ${isal_dir}/aes/cbc_pre.c
+ ${isal_dir}/aes/cbc_multibinary.asm
+ ${isal_dir}/aes/keyexp_128.asm
+ ${isal_dir}/aes/keyexp_192.asm
+ ${isal_dir}/aes/keyexp_256.asm
+ ${isal_dir}/aes/keyexp_multibinary.asm
+ ${isal_dir}/aes/cbc_dec_128_x4_sse.asm
+ ${isal_dir}/aes/cbc_dec_128_x8_avx.asm
+ ${isal_dir}/aes/cbc_dec_192_x4_sse.asm
+ ${isal_dir}/aes/cbc_dec_192_x8_avx.asm
+ ${isal_dir}/aes/cbc_dec_256_x4_sse.asm
+ ${isal_dir}/aes/cbc_dec_256_x8_avx.asm
+ ${isal_dir}/aes/cbc_enc_128_x4_sb.asm
+ ${isal_dir}/aes/cbc_enc_128_x8_sb.asm
+ ${isal_dir}/aes/cbc_enc_192_x4_sb.asm
+ ${isal_dir}/aes/cbc_enc_192_x8_sb.asm
+ ${isal_dir}/aes/cbc_enc_256_x4_sb.asm
+ ${isal_dir}/aes/cbc_enc_256_x8_sb.asm)
+
+if(HAVE_NASM_X64)
+add_dependencies(crypto_plugins ceph_crypto_isal)
+endif(HAVE_NASM_X64)
+
+add_library(ceph_crypto_isal SHARED ${isal_crypto_plugin_srcs})
+target_include_directories(ceph_crypto_isal PRIVATE ${isal_dir}/include)
+set_target_properties(ceph_crypto_isal PROPERTIES
+ VERSION 1.0.0
+ SOVERSION 1
+ INSTALL_RPATH "")
+install(TARGETS ceph_crypto_isal DESTINATION ${crypto_plugin_dir})
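
The ISA-L plugin is assembled from the upstream isa-l_crypto AES-CBC assembly plus the two Ceph glue files, built as a versioned shared library and installed into the plugin directory defined by the parent CMakeLists.txt. Given the VERSION/SOVERSION properties above, the installed layout would look roughly like this (a sketch; the exact CEPH_INSTALL_PKGLIBDIR value is distro-dependent):

    ${CEPH_INSTALL_PKGLIBDIR}/crypto/libceph_crypto_isal.so.1.0.0
    ${CEPH_INSTALL_PKGLIBDIR}/crypto/libceph_crypto_isal.so.1 -> libceph_crypto_isal.so.1.0.0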
diff --git a/src/crypto/isa-l/isa-l_crypto/.gitignore b/src/crypto/isa-l/isa-l_crypto/.gitignore
new file mode 100644
index 000000000..5d7ff17ad
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/.gitignore
@@ -0,0 +1,27 @@
+# Objects
+*~
+*.o
+*.lo
+*.so
+*.dll
+*.exp
+*.lib
+bin
+
+# Autobuild
+Makefile
+Makefile.in
+aclocal.m4
+autom4te.cache
+build-aux
+config.*
+configure
+.deps
+.dirstamp
+.libs
+libtool
+
+# Generated files
+isa-l_crypto.h
+/libisal_crypto.la
+libisal_crypto.pc
diff --git a/src/crypto/isa-l/isa-l_crypto/CONTRIBUTING.md b/src/crypto/isa-l/isa-l_crypto/CONTRIBUTING.md
new file mode 100644
index 000000000..3e95c0b54
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/CONTRIBUTING.md
@@ -0,0 +1,39 @@
+# Contributing to ISA-L_crypto
+
+Everyone is welcome to contribute. Patches may be submitted using GitHub pull
+requests (PRs). All commits must be signed off by the developer (--signoff)
+which indicates that you agree to the Developer Certificate of Origin. Patch
+discussion will happen directly on the GitHub PR. Design pre-work and general
+discussion occurs on the [mailing list]. Anyone can provide feedback in either
+location and all discussion is welcome. Decisions on whether to merge patches
+will be handled by the maintainer.
+
+## License
+
+ISA-L_crypto is licensed using a BSD 3-clause [license]. All code submitted to
+the project is required to carry that license.
+
+## Certificate of Origin
+
+In order to get a clear contribution chain of trust we use the
+[signed-off-by language] used by the Linux kernel project.
+
+## Mailing List
+
+Contributors and users are welcome to submit new requests on our roadmap, submit
+patches, file issues, and ask questions on our [mailing list].
+
+## Coding Style
+
+The coding style for ISA-L_crypto C code roughly follows linux kernel
+guidelines. Use the included indent script to format C code.
+
+ ./tools/iindent your_files.c
+
+And use check format script before submitting.
+
+ ./tools/check_format.sh
+
+[mailing list]:https://lists.01.org/mailman/listinfo/isal
+[license]:LICENSE
+[signed-off-by language]:https://01.org/community/signed-process
diff --git a/src/crypto/isa-l/isa-l_crypto/Doxyfile b/src/crypto/isa-l/isa-l_crypto/Doxyfile
new file mode 100644
index 000000000..9b37aac53
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/Doxyfile
@@ -0,0 +1,31 @@
+PROJECT_NAME = "Intel Intelligent Storage Acceleration Library Crypto"
+PROJECT_BRIEF = "ISA-L_crypto API reference doc"
+
+OUTPUT_DIRECTORY = generated_doc
+FULL_PATH_NAMES = NO
+TAB_SIZE = 8
+ALIASES = "requires=\xrefitem requires \"Requires\" \"Instruction Set Requirements for arch-specific functions (non-multibinary)\""
+OPTIMIZE_OUTPUT_FOR_C = YES
+HIDE_UNDOC_MEMBERS = YES
+USE_MDFILE_AS_MAINPAGE = README.md
+
+INPUT = isa-l_crypto.h \
+ include \
+ README.md \
+ CONTRIBUTING.md \
+ Release_notes.txt
+
+EXCLUDE = include/test.h include/memcpy_inline.h include/intrinreg.h include/endian_helper.h
+EXCLUDE_PATTERNS = */include/*_multibinary.h
+EXAMPLE_PATH = . aes md5_mb mh_sha1 mh_sha1_murmur3_x64_128 mh_sha256 rolling_hash sha1_mb sha256_mb sha512_mb
+PAPER_TYPE = letter
+LATEX_SOURCE_CODE = YES
+GENERATE_TREEVIEW = YES
+MACRO_EXPANSION = YES
+EXPAND_ONLY_PREDEF = YES
+PREDEFINED = "DECLARE_ALIGNED(n, a)=ALIGN n" \
+ __declspec(x)='x' \
+ align(x)='ALIGN \
+ x'
+EXPAND_AS_DEFINED = DECLARE_ALIGNED
+EXTENSION_MAPPING = "txt=md"
diff --git a/src/crypto/isa-l/isa-l_crypto/LICENSE b/src/crypto/isa-l/isa-l_crypto/LICENSE
new file mode 100644
index 000000000..ecebef110
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/LICENSE
@@ -0,0 +1,26 @@
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/crypto/isa-l/isa-l_crypto/Makefile.am b/src/crypto/isa-l/isa-l_crypto/Makefile.am
new file mode 100644
index 000000000..9151aab1b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/Makefile.am
@@ -0,0 +1,161 @@
+EXTRA_DIST = autogen.sh Makefile.unx make.inc Makefile.nmake isa-l_crypto.def LICENSE README.md Doxyfile
+CLEANFILES =
+LDADD =
+AM_MAKEFLAGS = --no-print-directory
+noinst_HEADERS =
+pkginclude_HEADERS = include/test.h include/types.h include/endian_helper.h
+noinst_LTLIBRARIES =
+INCLUDE = -I $(srcdir)/include/
+
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = libisal_crypto.pc
+EXTRA_DIST += libisal_crypto.pc.in
+CLEANFILES += libisal_crypto.pc
+
+lsrc=
+src_include=
+extern_hdrs=
+other_src=
+check_tests=
+unit_tests=
+perf_tests=
+unit_tests_extra=
+perf_tests_extra=
+examples=
+other_tests=
+lsrc32=
+lsrc_x86_64=
+lsrc_x86_32=
+lsrc_aarch64=
+lsrc_base_aliases=
+unit_tests32=
+perf_tests32=
+
+# Include units
+include sha1_mb/Makefile.am
+include mh_sha1/Makefile.am
+include md5_mb/Makefile.am
+include sha256_mb/Makefile.am
+include sha512_mb/Makefile.am
+include mh_sha1_murmur3_x64_128/Makefile.am
+include mh_sha256/Makefile.am
+include rolling_hash/Makefile.am
+include sm3_mb/Makefile.am
+include aes/Makefile.am
+
+# LIB version info not necessarily the same as package version
+LIBISAL_CURRENT=2
+LIBISAL_REVISION=24
+LIBISAL_AGE=0
+
+lib_LTLIBRARIES = libisal_crypto.la
+pkginclude_HEADERS += $(sort ${extern_hdrs})
+libisal_crypto_la_SOURCES = ${lsrc}
+if CPU_X86_64
+libisal_crypto_la_SOURCES += ${lsrc_x86_64}
+endif
+
+if CPU_X86_32
+libisal_crypto_la_SOURCES += ${lsrc_x86_32}
+endif
+
+if CPU_AARCH64
+libisal_crypto_la_SOURCES += ${lsrc_aarch64}
+endif
+
+if CPU_UNDEFINED
+libisal_crypto_la_SOURCES += ${lsrc_base_aliases}
+endif
+
+nobase_include_HEADERS = isa-l_crypto.h
+libisal_crypto_la_LDFLAGS = $(AM_LDFLAGS) \
+ -version-info $(LIBISAL_CURRENT):$(LIBISAL_REVISION):$(LIBISAL_AGE)
+libisal_crypto_la_LIBADD = ${noinst_LTLIBRARIES}
+
+EXTRA_DIST += ${other_src}
+EXTRA_DIST += Release_notes.txt
+
+# For tests
+LDADD += libisal_crypto.la
+check_PROGRAMS = ${check_tests}
+TESTS = ${check_tests}
+
+# For additional tests
+EXTRA_PROGRAMS = ${unit_tests}
+EXTRA_PROGRAMS += ${perf_tests}
+EXTRA_PROGRAMS += ${other_tests}
+EXTRA_PROGRAMS += ${examples}
+CLEANFILES += ${EXTRA_PROGRAMS}
+
+perfs: ${perf_tests}
+tests: ${unit_tests}
+checks: ${check_tests}
+other: ${other_tests}
+perf: $(addsuffix .run,$(perf_tests))
+ex: ${examples}
+test: $(addsuffix .run,$(unit_tests))
+
+# Build rule to run tests
+%.run: %
+ $<
+ @echo Completed run: $<
+
+# Support for yasm/nasm
+if INTEL_CET_ENABLED
+ export CET_LD=$(LD)
+endif
+if USE_YASM
+if INTEL_CET_ENABLED
+ as_filter = ${srcdir}/tools/yasm-cet-filter.sh
+else
+ as_filter = ${srcdir}/tools/yasm-filter.sh
+endif
+endif
+if USE_NASM
+if INTEL_CET_ENABLED
+ as_filter = ${srcdir}/tools/nasm-cet-filter.sh
+else
+ as_filter = ${srcdir}/tools/nasm-filter.sh
+endif
+endif
+if CPU_AARCH64
+ as_filter = $(CC) -D__ASSEMBLY__
+endif
+CCAS = $(as_filter)
+EXTRA_DIST += tools/yasm-filter.sh tools/nasm-filter.sh
+EXTRA_DIST += tools/yasm-cet-filter.sh tools/nasm-cet-filter.sh
+
+AM_CFLAGS = ${my_CFLAGS} ${INCLUDE} $(src_include) ${D}
+AM_CCASFLAGS = ${yasm_args} ${INCLUDE} $(src_include) ${DEFS} ${D}
+
+.asm.s:
+ @echo " MKTMP " $@;
+ @cp $< $@
+
+# Generate isa-l_crypto.h
+BUILT_SOURCES = isa-l_crypto.h
+CLEANFILES += isa-l_crypto.h
+isa-l_crypto.h:
+ @echo 'Building $@'
+ @echo '' >> $@
+ @echo '/**' >> $@
+ @echo ' * @file isa-l_crypto.h'>> $@
+ @echo ' * @brief Include for ISA-L_crypto library' >> $@
+ @echo ' */' >> $@
+ @echo '' >> $@
+ @echo '#ifndef _ISAL_CRYPTO_H_' >> $@
+ @echo '#define _ISAL_CRYPTO_H_' >> $@
+ @echo '' >> $@
+ @echo '#define.ISAL_CRYPTO_MAJOR_VERSION.${VERSION}' | ${AWK} -F . '{print $$1, $$2, $$3}' >> $@
+ @echo '#define.ISAL_CRYPTO_MINOR_VERSION.${VERSION}' | ${AWK} -F . '{print $$1, $$2, $$4}' >> $@
+ @echo '#define.ISAL_CRYPTO_PATCH_VERSION.${VERSION}' | ${AWK} -F . '{print $$1, $$2, $$5}' >> $@
+ @echo '#define ISAL_CRYPTO_MAKE_VERSION(maj, min, patch) ((maj) * 0x10000 + (min) * 0x100 + (patch))' >> $@
+ @echo '#define ISAL_CRYPTO_VERSION ISAL_CRYPTO_MAKE_VERSION(ISAL_CRYPTO_MAJOR_VERSION, ISAL_CRYPTO_MINOR_VERSION, ISAL_CRYPTO_PATCH_VERSION)' >> $@
+ @echo '' >> $@
+ @for unit in $(sort $(extern_hdrs)); do echo "#include <isa-l_crypto/$$unit>" | sed -e 's;include/;;' >> $@; done
+ @echo '#endif //_ISAL_CRYPTO_H_' >> $@
+
+doc: isa-l_crypto.h
+ (cat Doxyfile; echo 'PROJECT_NUMBER=${VERSION}') | doxygen -
+ $(MAKE) -C generated_doc/latex &> generated_doc/latex_build_api.log
+ cp generated_doc/latex/refman.pdf isa-l_crypto_api_${VERSION}.pdf
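
One non-obvious bit in the isa-l_crypto.h rule above is the version-macro generation: the '.' characters in the echoed '#define.ISAL_CRYPTO_*_VERSION.${VERSION}' strings act as field separators, so awk -F . can split the macro text and the dotted version number into one field list and print only the wanted pieces. A worked example, assuming VERSION expands to something like 2.24.0:

    $ echo '#define.ISAL_CRYPTO_MAJOR_VERSION.2.24.0' | awk -F . '{print $1, $2, $3}'
    #define ISAL_CRYPTO_MAJOR_VERSION 2
    $ echo '#define.ISAL_CRYPTO_MINOR_VERSION.2.24.0' | awk -F . '{print $1, $2, $4}'
    #define ISAL_CRYPTO_MINOR_VERSION 24
    $ echo '#define.ISAL_CRYPTO_PATCH_VERSION.2.24.0' | awk -F . '{print $1, $2, $5}'
    #define ISAL_CRYPTO_PATCH_VERSION 0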
diff --git a/src/crypto/isa-l/isa-l_crypto/Makefile.nmake b/src/crypto/isa-l/isa-l_crypto/Makefile.nmake
new file mode 100644
index 000000000..a3e577277
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/Makefile.nmake
@@ -0,0 +1,493 @@
+########################################################################
+# Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+# This file can be auto-regenerated with $make -f Makefile.unx Makefile.nmake
+
+objs = \
+ bin\sha1_ctx_sse.obj \
+ bin\sha1_ctx_avx.obj \
+ bin\sha1_ctx_avx2.obj \
+ bin\sha1_ctx_base.obj \
+ bin\sha1_mb_mgr_init_sse.obj \
+ bin\sha1_mb_mgr_init_avx2.obj \
+ bin\sha1_mb_mgr_submit_sse.obj \
+ bin\sha1_mb_mgr_submit_avx.obj \
+ bin\sha1_mb_mgr_submit_avx2.obj \
+ bin\sha1_mb_mgr_flush_sse.obj \
+ bin\sha1_mb_mgr_flush_avx.obj \
+ bin\sha1_mb_mgr_flush_avx2.obj \
+ bin\sha1_mb_x4_sse.obj \
+ bin\sha1_mb_x4_avx.obj \
+ bin\sha1_mb_x8_avx2.obj \
+ bin\sha1_multibinary.obj \
+ bin\sha1_ctx_avx512.obj \
+ bin\sha1_mb_mgr_init_avx512.obj \
+ bin\sha1_mb_mgr_submit_avx512.obj \
+ bin\sha1_mb_mgr_flush_avx512.obj \
+ bin\sha1_mb_x16_avx512.obj \
+ bin\sha1_opt_x1.obj \
+ bin\sha1_ni_x1.obj \
+ bin\sha1_ni_x2.obj \
+ bin\sha1_ctx_sse_ni.obj \
+ bin\sha1_ctx_avx512_ni.obj \
+ bin\sha1_mb_mgr_submit_sse_ni.obj \
+ bin\sha1_mb_mgr_flush_sse_ni.obj \
+ bin\sha1_mb_mgr_flush_avx512_ni.obj \
+ bin\sha256_ctx_sse.obj \
+ bin\sha256_ctx_avx.obj \
+ bin\sha256_ctx_avx2.obj \
+ bin\sha256_ctx_base.obj \
+ bin\sha256_mb_mgr_init_sse.obj \
+ bin\sha256_mb_mgr_init_avx2.obj \
+ bin\sha256_mb_mgr_submit_sse.obj \
+ bin\sha256_mb_mgr_submit_avx.obj \
+ bin\sha256_mb_mgr_submit_avx2.obj \
+ bin\sha256_mb_mgr_flush_sse.obj \
+ bin\sha256_mb_mgr_flush_avx.obj \
+ bin\sha256_mb_mgr_flush_avx2.obj \
+ bin\sha256_mb_x4_sse.obj \
+ bin\sha256_mb_x4_avx.obj \
+ bin\sha256_mb_x8_avx2.obj \
+ bin\sha256_multibinary.obj \
+ bin\sha256_ctx_avx512.obj \
+ bin\sha256_mb_mgr_init_avx512.obj \
+ bin\sha256_mb_mgr_submit_avx512.obj \
+ bin\sha256_mb_mgr_flush_avx512.obj \
+ bin\sha256_mb_x16_avx512.obj \
+ bin\sha256_opt_x1.obj \
+ bin\sha256_ni_x1.obj \
+ bin\sha256_ni_x2.obj \
+ bin\sha256_ctx_sse_ni.obj \
+ bin\sha256_ctx_avx512_ni.obj \
+ bin\sha256_mb_mgr_submit_sse_ni.obj \
+ bin\sha256_mb_mgr_flush_sse_ni.obj \
+ bin\sha256_mb_mgr_flush_avx512_ni.obj \
+ bin\sha512_ctx_sse.obj \
+ bin\sha512_ctx_avx.obj \
+ bin\sha512_ctx_avx2.obj \
+ bin\sha512_ctx_sb_sse4.obj \
+ bin\sha512_ctx_base.obj \
+ bin\sha512_mb_mgr_init_sse.obj \
+ bin\sha512_mb_mgr_init_avx2.obj \
+ bin\sha512_sb_mgr_init_sse4.obj \
+ bin\sha512_mb_mgr_submit_sse.obj \
+ bin\sha512_mb_mgr_submit_avx.obj \
+ bin\sha512_mb_mgr_submit_avx2.obj \
+ bin\sha512_mb_mgr_flush_sse.obj \
+ bin\sha512_mb_mgr_flush_avx.obj \
+ bin\sha512_mb_mgr_flush_avx2.obj \
+ bin\sha512_mb_x2_sse.obj \
+ bin\sha512_mb_x2_avx.obj \
+ bin\sha512_mb_x4_avx2.obj \
+ bin\sha512_multibinary.obj \
+ bin\sha512_sb_mgr_submit_sse4.obj \
+ bin\sha512_sb_mgr_flush_sse4.obj \
+ bin\sha512_sse4.obj \
+ bin\sha512_ctx_avx512.obj \
+ bin\sha512_mb_mgr_init_avx512.obj \
+ bin\sha512_mb_mgr_submit_avx512.obj \
+ bin\sha512_mb_mgr_flush_avx512.obj \
+ bin\sha512_mb_x8_avx512.obj \
+ bin\md5_ctx_sse.obj \
+ bin\md5_ctx_avx.obj \
+ bin\md5_ctx_avx2.obj \
+ bin\md5_ctx_base.obj \
+ bin\md5_mb_mgr_init_sse.obj \
+ bin\md5_mb_mgr_init_avx2.obj \
+ bin\md5_mb_mgr_init_avx512.obj \
+ bin\md5_mb_mgr_submit_sse.obj \
+ bin\md5_mb_mgr_submit_avx.obj \
+ bin\md5_mb_mgr_submit_avx2.obj \
+ bin\md5_mb_mgr_flush_sse.obj \
+ bin\md5_mb_mgr_flush_avx.obj \
+ bin\md5_mb_mgr_flush_avx2.obj \
+ bin\md5_mb_x4x2_sse.obj \
+ bin\md5_mb_x4x2_avx.obj \
+ bin\md5_mb_x8x2_avx2.obj \
+ bin\md5_multibinary.obj \
+ bin\md5_mb_mgr_submit_avx512.obj \
+ bin\md5_mb_mgr_flush_avx512.obj \
+ bin\md5_mb_x16x2_avx512.obj \
+ bin\md5_ctx_avx512.obj \
+ bin\mh_sha1_block_base.obj \
+ bin\mh_sha1_finalize_base.obj \
+ bin\mh_sha1_update_base.obj \
+ bin\sha1_for_mh_sha1.obj \
+ bin\mh_sha1.obj \
+ bin\mh_sha1_multibinary.obj \
+ bin\mh_sha1_block_sse.obj \
+ bin\mh_sha1_block_avx.obj \
+ bin\mh_sha1_block_avx2.obj \
+ bin\mh_sha1_block_avx512.obj \
+ bin\mh_sha1_avx512.obj \
+ bin\murmur3_x64_128_internal.obj \
+ bin\mh_sha1_murmur3_x64_128.obj \
+ bin\mh_sha1_murmur3_x64_128_finalize_base.obj \
+ bin\mh_sha1_murmur3_x64_128_update_base.obj \
+ bin\mh_sha1_murmur3_x64_128_block_sse.obj \
+ bin\mh_sha1_murmur3_x64_128_block_avx.obj \
+ bin\mh_sha1_murmur3_x64_128_block_avx2.obj \
+ bin\mh_sha1_murmur3_x64_128_multibinary.obj \
+ bin\mh_sha1_murmur3_x64_128_avx512.obj \
+ bin\mh_sha1_murmur3_x64_128_block_avx512.obj \
+ bin\sha256_for_mh_sha256.obj \
+ bin\mh_sha256.obj \
+ bin\mh_sha256_block_sse.obj \
+ bin\mh_sha256_block_avx.obj \
+ bin\mh_sha256_block_avx2.obj \
+ bin\mh_sha256_multibinary.obj \
+ bin\mh_sha256_finalize_base.obj \
+ bin\mh_sha256_update_base.obj \
+ bin\mh_sha256_block_base.obj \
+ bin\mh_sha256_block_avx512.obj \
+ bin\mh_sha256_avx512.obj \
+ bin\rolling_hashx_base.obj \
+ bin\rolling_hash2.obj \
+ bin\rolling_hash2_until_04.obj \
+ bin\rolling_hash2_until_00.obj \
+ bin\rolling_hash2_multibinary.obj \
+ bin\sm3_ctx_base.obj \
+ bin\sm3_multibinary.obj \
+ bin\sm3_ctx_avx512.obj \
+ bin\sm3_mb_mgr_submit_avx512.obj \
+ bin\sm3_mb_mgr_flush_avx512.obj \
+ bin\sm3_mb_x16_avx512.obj \
+ bin\sm3_ctx_avx2.obj \
+ bin\sm3_mb_mgr_submit_avx2.obj \
+ bin\sm3_mb_mgr_flush_avx2.obj \
+ bin\sm3_mb_x8_avx2.obj \
+ bin\gcm_multibinary.obj \
+ bin\gcm_pre.obj \
+ bin\gcm128_avx_gen2.obj \
+ bin\gcm128_avx_gen4.obj \
+ bin\gcm128_sse.obj \
+ bin\gcm256_avx_gen2.obj \
+ bin\gcm256_avx_gen4.obj \
+ bin\gcm256_sse.obj \
+ bin\gcm128_vaes_avx512.obj \
+ bin\gcm256_vaes_avx512.obj \
+ bin\gcm128_avx_gen2_nt.obj \
+ bin\gcm128_avx_gen4_nt.obj \
+ bin\gcm128_sse_nt.obj \
+ bin\gcm256_avx_gen2_nt.obj \
+ bin\gcm256_avx_gen4_nt.obj \
+ bin\gcm256_sse_nt.obj \
+ bin\gcm128_vaes_avx512_nt.obj \
+ bin\gcm256_vaes_avx512_nt.obj \
+ bin\gcm_multibinary_nt.obj \
+ bin\keyexp_multibinary.obj \
+ bin\keyexp_128.obj \
+ bin\keyexp_192.obj \
+ bin\keyexp_256.obj \
+ bin\cbc_multibinary.obj \
+ bin\cbc_dec_128_x4_sse.obj \
+ bin\cbc_dec_128_x8_avx.obj \
+ bin\cbc_dec_192_x4_sse.obj \
+ bin\cbc_dec_192_x8_avx.obj \
+ bin\cbc_dec_256_x4_sse.obj \
+ bin\cbc_dec_256_x8_avx.obj \
+ bin\cbc_enc_128_x4_sb.obj \
+ bin\cbc_enc_128_x8_sb.obj \
+ bin\cbc_enc_192_x4_sb.obj \
+ bin\cbc_enc_192_x8_sb.obj \
+ bin\cbc_enc_256_x4_sb.obj \
+ bin\cbc_enc_256_x8_sb.obj \
+ bin\cbc_dec_vaes_avx512.obj \
+ bin\cbc_pre.obj \
+ bin\xts_aes_128_multibinary.obj \
+ bin\XTS_AES_128_dec_sse.obj \
+ bin\XTS_AES_128_dec_expanded_key_sse.obj \
+ bin\XTS_AES_128_enc_sse.obj \
+ bin\XTS_AES_128_enc_expanded_key_sse.obj \
+ bin\XTS_AES_128_dec_avx.obj \
+ bin\XTS_AES_128_dec_expanded_key_avx.obj \
+ bin\XTS_AES_128_enc_avx.obj \
+ bin\XTS_AES_128_enc_expanded_key_avx.obj \
+ bin\xts_aes_256_multibinary.obj \
+ bin\XTS_AES_256_dec_avx.obj \
+ bin\XTS_AES_256_dec_expanded_key_avx.obj \
+ bin\XTS_AES_256_enc_avx.obj \
+ bin\XTS_AES_256_enc_expanded_key_avx.obj \
+ bin\XTS_AES_256_dec_sse.obj \
+ bin\XTS_AES_256_dec_expanded_key_sse.obj \
+ bin\XTS_AES_256_enc_sse.obj \
+ bin\XTS_AES_256_enc_expanded_key_sse.obj \
+ bin\XTS_AES_256_enc_vaes.obj \
+ bin\XTS_AES_128_enc_vaes.obj \
+ bin\XTS_AES_256_enc_expanded_key_vaes.obj \
+ bin\XTS_AES_128_enc_expanded_key_vaes.obj \
+ bin\XTS_AES_256_dec_vaes.obj \
+ bin\XTS_AES_128_dec_vaes.obj \
+ bin\XTS_AES_256_dec_expanded_key_vaes.obj \
+ bin\XTS_AES_128_dec_expanded_key_vaes.obj
+
+INCLUDES = -I./ -Isha1_mb/ -Isha256_mb/ -Isha512_mb/ -Imd5_mb/ -Imh_sha1/ -Imh_sha1_murmur3_x64_128/ -Imh_sha256/ -Irolling_hash/ -Ism3_mb/ -Iaes/ -Iinclude/
+# Modern asm feature level, consider upgrading nasm/yasm before decreasing feature_level
+FEAT_FLAGS = -DHAVE_AS_KNOWS_AVX512 -DAS_FEATURE_LEVEL=10 -DHAVE_AS_KNOWS_SHANI
+CFLAGS_REL = -O2 -DNDEBUG /Z7 /MD /Gy
+CFLAGS_DBG = -Od -DDEBUG /Z7 /MDd
+LINKFLAGS = -nologo -incremental:no -debug
+CFLAGS = $(CFLAGS_REL) -nologo -D_USE_MATH_DEFINES $(FEAT_FLAGS) $(INCLUDES) $(D)
+AFLAGS = -f win64 $(FEAT_FLAGS) $(INCLUDES) $(D)
+CC = cl
+# or CC = icl -Qstd=c99
+AS = nasm
+
+lib: bin static dll
+static: bin isa-l_crypto_static.lib
+dll: bin isa-l_crypto.dll
+
+bin: ; -mkdir $@
+
+isa-l_crypto_static.lib: $(objs)
+ lib -out:$@ @<<
+$?
+<<
+
+isa-l_crypto.dll: $(objs)
+ link -out:$@ -dll -def:isa-l_crypto.def $(LINKFLAGS) @<<
+$?
+<<
+
+{sha1_mb}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{sha1_mb}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{sha256_mb}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{sha256_mb}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{sha512_mb}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{sha512_mb}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{md5_mb}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{md5_mb}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{mh_sha1}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{mh_sha1}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{mh_sha1_murmur3_x64_128}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{mh_sha1_murmur3_x64_128}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{mh_sha256}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{mh_sha256}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{rolling_hash}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{rolling_hash}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{sm3_mb}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{sm3_mb}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{aes}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{aes}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+
+# Examples
+ex = \
+ sha1_multi_buffer_example.exe \
+ gcm_simple_example.exe
+
+ex: lib $(ex)
+
+$(ex): $(@B).obj
+
+.obj.exe:
+ link /out:$@ $(LINKFLAGS) isa-l_crypto.lib $?
+
+# Check tests
+checks = \
+ sha1_mb_test.exe \
+ sha1_mb_rand_test.exe \
+ sha1_mb_rand_update_test.exe \
+ sha1_mb_flush_test.exe \
+ sha256_mb_test.exe \
+ sha256_mb_rand_test.exe \
+ sha256_mb_rand_update_test.exe \
+ sha256_mb_flush_test.exe \
+ sha512_mb_test.exe \
+ sha512_mb_rand_test.exe \
+ sha512_mb_rand_update_test.exe \
+ md5_mb_test.exe \
+ md5_mb_rand_test.exe \
+ md5_mb_rand_update_test.exe \
+ mh_sha1_test.exe \
+ mh_sha256_test.exe \
+ rolling_hash2_test.exe \
+ sm3_ref_test.exe \
+ cbc_std_vectors_test.exe \
+ gcm_std_vectors_test.exe \
+ gcm_nt_std_vectors_test.exe \
+ xts_128_test.exe \
+ xts_256_test.exe \
+ xts_128_expanded_key_test.exe \
+ xts_256_expanded_key_test.exe
+
+checks: lib $(checks)
+$(checks): $(@B).obj
+check: $(checks)
+ !$?
+
+# Unit tests
+tests = \
+ sha1_mb_rand_ssl_test.exe \
+ sha256_mb_rand_ssl_test.exe \
+ sha512_mb_rand_ssl_test.exe \
+ md5_mb_rand_ssl_test.exe \
+ mh_sha1_update_test.exe \
+ mh_sha1_murmur3_x64_128_test.exe \
+ mh_sha1_murmur3_x64_128_update_test.exe \
+ mh_sha256_update_test.exe \
+ sm3_mb_rand_ssl_test.exe \
+ sm3_mb_rand_test.exe \
+ sm3_mb_rand_update_test.exe \
+ sm3_mb_flush_test.exe \
+ sm3_mb_test.exe \
+ cbc_std_vectors_random_test.exe \
+ gcm_std_vectors_random_test.exe \
+ gcm_nt_rand_test.exe \
+ xts_128_rand.exe \
+ xts_128_rand_ossl_test.exe \
+ xts_256_rand.exe \
+ xts_256_rand_ossl_test.exe
+
+tests: lib $(tests)
+$(tests): $(@B).obj
+
+# Performance tests
+perfs = \
+ sha1_mb_vs_ossl_perf.exe \
+ sha1_mb_vs_ossl_shortage_perf.exe \
+ sha256_mb_vs_ossl_perf.exe \
+ sha256_mb_vs_ossl_shortage_perf.exe \
+ sha512_mb_vs_ossl_perf.exe \
+ md5_mb_vs_ossl_perf.exe \
+ mh_sha1_perf.exe \
+ mh_sha1_murmur3_x64_128_perf.exe \
+ mh_sha256_perf.exe \
+ rolling_hash2_perf.exe \
+ sm3_mb_vs_ossl_perf.exe \
+ sm3_mb_vs_ossl_shortage_perf.exe \
+ cbc_ossl_perf.exe \
+ gcm_ossl_perf.exe \
+ xts_128_enc_ossl_perf.exe \
+ xts_256_enc_ossl_perf.exe \
+ xts_128_enc_perf.exe \
+ xts_128_dec_perf.exe \
+ xts_128_dec_ossl_perf.exe \
+ xts_256_enc_perf.exe \
+ xts_256_dec_perf.exe \
+ xts_256_dec_ossl_perf.exe
+
+perfs: lib $(perfs)
+$(perfs): $(@B).obj
+
+progs =
+
+progs: lib $(progs)
+
+clean:
+ -if exist *.obj del *.obj
+ -if exist bin\*.obj del bin\*.obj
+ -if exist isa-l_crypto_static.lib del isa-l_crypto_static.lib
+ -if exist *.exe del *.exe
+ -if exist *.pdb del *.pdb
+ -if exist isa-l_crypto.lib del isa-l_crypto.lib
+ -if exist isa-l_crypto.dll del isa-l_crypto.dll
+ -if exist isa-l_crypto.exp del isa-l_crypto.exp
+
+libcrypto.lib:
+sha1_mb_rand_test.exe: sha1_ref.obj
+sha1_mb_rand_update_test.exe: sha1_ref.obj
+sha1_mb_flush_test.exe: sha1_ref.obj
+sha1_mb_rand_ssl_test.exe: libcrypto.lib
+sha1_mb_vs_ossl_perf.exe: libcrypto.lib
+sha1_mb_vs_ossl_shortage_perf.exe: libcrypto.lib
+sha256_mb_rand_ssl_test.exe: sha256_ref.obj
+sha256_mb_rand_test.exe: sha256_ref.obj
+sha256_mb_rand_update_test.exe: sha256_ref.obj
+sha256_mb_flush_test.exe: sha256_ref.obj
+sha256_mb_rand_ssl_test.exe: libcrypto.lib
+sha256_mb_vs_ossl_perf.exe: libcrypto.lib
+sha256_mb_vs_ossl_shortage_perf.exe: libcrypto.lib
+sha512_mb_rand_test.exe: sha512_ref.obj
+sha512_mb_rand_update_test.exe: sha512_ref.obj
+sha512_mb_rand_ssl_test.exe: libcrypto.lib
+sha512_mb_vs_ossl_perf.exe: libcrypto.lib
+md5_mb_rand_test.exe: md5_ref.obj
+md5_mb_rand_update_test.exe: md5_ref.obj
+md5_mb_rand_ssl_test.exe: libcrypto.lib
+md5_mb_vs_ossl_perf.exe: libcrypto.lib
+mh_sha1_test.exe: mh_sha1_ref.obj
+mh_sha1_update_test.exe: mh_sha1_ref.obj
+mh_sha1_murmur3_x64_128_test.exe: mh_sha1_ref.obj murmur3_x64_128.obj
+mh_sha1_murmur3_x64_128_update_test.exe: mh_sha1_ref.obj murmur3_x64_128.obj
+mh_sha1_murmur3_x64_128_perf.exe: mh_sha1_ref.obj murmur3_x64_128.obj
+mh_sha256_test.exe: mh_sha256_ref.obj
+mh_sha256_update_test.exe: mh_sha256_ref.obj
+sm3_mb_rand_ssl_test.exe: libcrypto.lib
+sm3_mb_rand_ssl_test.exe: sm3_test_helper.obj
+sm3_mb_rand_update_test.exe: libcrypto.lib
+sm3_mb_rand_update_test.exe: sm3_test_helper.obj
+sm3_mb_flush_test.exe: libcrypto.lib
+sm3_mb_flush_test.exe: sm3_test_helper.obj
+sm3_mb_rand_test.exe: libcrypto.lib
+sm3_mb_rand_test.exe: sm3_test_helper.obj
+sm3_mb_vs_ossl_perf.exe: libcrypto.lib
+sm3_mb_vs_ossl_perf.exe: sm3_test_helper.obj
+sm3_mb_vs_ossl_shortage_perf.exe: libcrypto.lib
+sm3_mb_vs_ossl_shortage_perf.exe: sm3_test_helper.obj
+cbc_ossl_perf.exe: libcrypto.lib
+cbc_std_vectors_random_test.exe: libcrypto.lib
+gcm_ossl_perf.exe: libcrypto.lib
+gcm_std_vectors_random_test.exe: libcrypto.lib
+gcm_nt_rand_test.exe: libcrypto.lib
+xts_128_enc_ossl_perf.exe: libcrypto.lib
+xts_128_dec_ossl_perf.exe: libcrypto.lib
+xts_128_rand_ossl_test.exe: libcrypto.lib
+xts_256_enc_ossl_perf.exe: libcrypto.lib
+xts_256_dec_ossl_perf.exe: libcrypto.lib
+xts_256_rand_ossl_test.exe: libcrypto.lib
diff --git a/src/crypto/isa-l/isa-l_crypto/Makefile.unx b/src/crypto/isa-l/isa-l_crypto/Makefile.unx
new file mode 100644
index 000000000..7452f71b0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/Makefile.unx
@@ -0,0 +1,50 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+host_cpu ?= $(shell uname -m | sed -e 's/amd/x86_/')
+arch ?= $(shell uname | grep -v -e Linux -e BSD )
+
+
+
+units ?=sha1_mb sha256_mb sha512_mb md5_mb mh_sha1 mh_sha1_murmur3_x64_128 \
+ mh_sha256 rolling_hash sm3_mb
+ifneq ($(arch),noarch)
+units +=aes
+endif
+ifeq ($(host_cpu)_$(arch),aarch64_)
+ arch = aarch64
+endif
+default: lib
+include $(foreach unit,$(units), $(unit)/Makefile.am)
+
+# Override individual lib names to make one inclusive library.
+lib_name := bin/isa-l_crypto.a
+
+include make.inc
+include tools/gen_nmake.mk
+VPATH = . $(units) include
diff --git a/src/crypto/isa-l/isa-l_crypto/README.md b/src/crypto/isa-l/isa-l_crypto/README.md
new file mode 100644
index 000000000..f9f560c54
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/README.md
@@ -0,0 +1,63 @@
+Intel(R) Intelligent Storage Acceleration Library Crypto Version
+================================================================
+
+ISA-L_crypto is a collection of optimized low-level functions targeting storage
+applications. ISA-L_crypto includes:
+
+* Multi-buffer hashes - run multiple hash jobs together on one core for much
+  better throughput than single-buffer versions (see the usage sketch after
+  this list).
+  - SHA1, SHA256, SHA512, MD5, SM3
+
+* Multi-hash - Get the performance of multi-buffer hashing with a single-buffer
+  interface. Specification ref: [Multi-Hash white paper](https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/multi-hash-paper.pdf)
+
+* Multi-hash + murmur - run both together.
+
+* AES - block ciphers
+ - XTS, GCM, CBC
+
+* Rolling hash - Hash input in a window which moves through the input
+
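A minimal sketch of the multi-buffer call pattern described above, using the SHA256
variant. It assumes the `sha256_mb.h` context-manager API (`SHA256_HASH_CTX_MGR`,
`sha256_ctx_mgr_init`/`_submit`/`_flush`), roughly the pattern the bundled tests and
the `sha1_multi_buffer_example` follow for SHA1; the job count and buffer size are
placeholders:

    #include <stdlib.h>
    #include <string.h>
    #include "sha256_mb.h"

    #define NUM_JOBS 16
    #define BUF_LEN  4096

    int main(void)
    {
        SHA256_HASH_CTX_MGR *mgr = NULL;
        SHA256_HASH_CTX ctxpool[NUM_JOBS];
        static unsigned char bufs[NUM_JOBS][BUF_LEN];
        int i;

        /* The context manager holds the SIMD lane state; keep it 16-byte aligned. */
        if (posix_memalign((void **)&mgr, 16, sizeof(*mgr)))
            return 1;
        sha256_ctx_mgr_init(mgr);

        /* Each submit starts an independent job; jobs are hashed together in lanes. */
        for (i = 0; i < NUM_JOBS; i++) {
            hash_ctx_init(&ctxpool[i]);
            memset(bufs[i], i, BUF_LEN);
            sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], BUF_LEN, HASH_ENTIRE);
        }

        /* Flush drains lanes that never filled; it returns NULL when all jobs are done. */
        while (sha256_ctx_mgr_flush(mgr) != NULL)
            ;

        /* Digest words are now in ctxpool[i].job.result_digest[0..7]. */
        free(mgr);
        return 0;
    }

Link such a program against the built library (for example -lisal_crypto).
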
+Also see:
+* [ISA-L_crypto for updates](https://github.com/intel/isa-l_crypto).
+* For non-crypto ISA-L see [isa-l on github](https://github.com/intel/isa-l).
+* The [github wiki](https://github.com/intel/isa-l/wiki) covering isa-l and
+ isa-l crypto.
+* [Contributing](CONTRIBUTING.md).
+
+Building ISA-L
+--------------
+
+### Prerequisites
+
+* Assembler: nasm v2.11.01 or later (nasm v2.13 or better suggested for building in AVX512 support)
+ or yasm version 1.2.0 or later.
+* Compiler: gcc, clang, icc or VC compiler.
+* Make: GNU 'make' or 'nmake' (Windows).
+* Optional: Building with autotools requires autoconf/automake packages.
+
+### Autotools
+To build and install the library with autotools it is usually sufficient to run:
+
+ ./autogen.sh
+ ./configure
+ make
+ sudo make install
+
+### Makefile
+To use a standard makefile run:
+
+ make -f Makefile.unx
+
+### Windows
+On Windows use nmake to build dll and static lib:
+
+ nmake -f Makefile.nmake
+
+### Other make targets
+Other targets include:
+* `make check` : create and run tests
+* `make tests` : create additional unit tests
+* `make perfs` : create included performance tests
+* `make ex` : build examples
+* `make doc` : build API manual
diff --git a/src/crypto/isa-l/isa-l_crypto/Release_notes.txt b/src/crypto/isa-l/isa-l_crypto/Release_notes.txt
new file mode 100644
index 000000000..097107585
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/Release_notes.txt
@@ -0,0 +1,215 @@
+================================================================================
+v2.24 Intel Intelligent Storage Acceleration Library Crypto Release Notes
+================================================================================
+
+================================================================================
+RELEASE NOTE CONTENTS
+================================================================================
+1. KNOWN ISSUES
+2. FIXED ISSUES
+3. CHANGE LOG & FEATURES ADDED
+
+================================================================================
+1. KNOWN ISSUES
+================================================================================
+
+* Perf tests do not run in Windows environment.
+
+* 32-bit lib is not supported in Windows.
+
+================================================================================
+2. FIXED ISSUES
+================================================================================
+v2.21
+
+* Put correct vec instruction versions in aes_cbc_enc_{128,192,256}(). May help
+ performance on some systems.
+
+v2.20
+
+* Fix issue with new aes_gcm API, aes_gcm_pre_256 was incorrect.
+
+* Multi-buffer hash max length extended. Previous max length for
+ {sha1,sha256,sha512,md5}_mb was 4095MB. While there is still a 4GB limit for
+  each submit, the total hashed length can now be larger than 4GB.
+
+v2.18
+
+* Fix for multi-buffer hash when total length is above 512MB.
+
+v2.14
+
+* Building in unit directories is no longer supported, removing the issue of
+  leftover object files causing the top-level make build to fail.
+
+v2.9
+
+* Multi-buffer MD5 AVX2 tests fixed to work on FreeBSD 9.1 by explicitly aligning
+ structures.
+
+v2.7
+
+* Unit tests and examples are now supported in the Windows environment.
+
+
+================================================================================
+3. CHANGE LOG & FEATURES ADDED
+================================================================================
+v2.24
+
+* New optimized version of AES-CBC decode
+
+* New AVX2, 8 lane version of multi-buffer SM3
+
+* Added support for big-endian architectures
+
+v2.23
+
+* New optimized versions of block ciphers AES-GCM and AES-XTS.
+
+* New optimized versions of multi-buffer SM3 hashing. Removed experimental
+ status.
+
+v2.22
+
+* New multi-buffer SM3 functions. Experimental base functions only.
+
+* New multi-arch support.
+
+v2.21
+
+* Multi-buffer hash performance improvement for Intel(R) Atom(tm) processors.
+ New by-2 shani versions for multi-buffer sha1 & sha256.
+
+* New base functions for multi-buffer hashes.
+ md5_mb, sha1_mb, sha256_mb, sha512_mb.
+
+v2.20
+
+* New functions
+ - Non-temporal versions of aes_gcm added.
+
+* Multi-buffer hash improvement
+ - Increase max length of hash in {sha1,sha256,sha512,md5}_mb to > 4GB.
+
+v2.19
+
+* Multi-buffer hash (sha1_mb, sha256_mb)
+
+ - Choose fast single buffer routine to do flush operation if lanes aren't full.
+
+ - Add SHA-NI support for Goldmont and Cannonlake.
+
+* AES-GCM interface updates.
+
+  - New interface separates the expanded keys and other context into two
+    structures. The old interface is maintained for backward compatibility.
+    (A minimal call sequence is sketched after this entry.)
+
+  - The user no longer has to append the GCM_IV_END_MARK manually to the end
+    of the IV, as this is now done automatically. This update should also
+    improve the performance of small packets.
+
+* Rolling hash is released.
+
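A minimal sketch of the two-structure interface noted in the v2.19 entry above.
It assumes the aes_gcm.h names (struct gcm_key_data, struct gcm_context_data,
aes_gcm_pre_128, aes_gcm_enc_128) and uses placeholder key, IV and buffer values:

    #include <stdint.h>
    #include "aes_gcm.h"

    int main(void)
    {
        struct gcm_key_data gkey;        /* expanded keys, reusable across messages */
        struct gcm_context_data gctx;    /* per-message working state */
        uint8_t key[16] = {0}, iv[12] = {0};
        uint8_t aad[16] = {0}, tag[16];
        uint8_t pt[64] = {0}, ct[64];

        /* Expand the key once; no GCM_IV_END_MARK handling is needed. */
        aes_gcm_pre_128(key, &gkey);

        /* Encrypt and authenticate in one call; iv is the plain 12-byte IV. */
        aes_gcm_enc_128(&gkey, &gctx, ct, pt, sizeof(pt), iv,
                        aad, sizeof(aad), tag, sizeof(tag));
        return 0;
    }
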
+v2.18
+
+* New multi-hash SHA256-based version.
+
+v2.16
+
+* Split lib from non-crypto functions.
+
+v2.15
+
+* Multi-buffer hash updates. New AVX512 versions for multi-buffer SHA1, SHA256,
+ SHA512, MD5 and SHA1+murmur3_x64_128 stitched.
+
+* Removes restrictions on AAD length in AES-GCM. Previously AAD length was
+ limited to a multiple of 4 bytes. Now any AAD length is allowed.
+
+* Nasm support. ISA-L ported to build with nasm or yasm assembler.
+
+* Windows DLL support. Windows builds DLL by default.
+
+* The older, deprecated multi-buffer API has been removed.
+
+v2.14
+
+* New multi-hash sha1 function and multi-hash sha1 + murmur3_x64_128 stitched.
+ Multi-hash is designed to give the performance of multi-buffer cryptographic
+ hashes with a synchronous single buffer interface.
+
+* New AES-GCM and AES-CBC functions added.
+
+* Autoconf and autotools build allows easier porting to additional systems.
+ Previous make system still available to embedded users with Makefile.unx.
+
+* The AES key expand functions that were used for AES-XTS with pre-expanded keys
+ now expand the decrypt keys in a different order. The order that decrypt keys
+ are stored and used by XTS_AES_128_dec_expanded_key() is reversed from
+  previous versions to be compatible with CBC and GCM key expansion. The
+ aes_keyexp_*() and XTS_AES_128_dec_expanded_key() functions should work the
+ same when paired together.
+
+* Includes update for building on Mac OS X/darwin systems. Add --target=darwin
+ to ./configure step.
+
+v2.10
+
+* Added multi-buffer MD5 in the new hash API. Includes multi-binary capability,
+ no restriction on update length and other benefits of the CTX API.
+
+v2.9
+
+* New multi-buffer hash API. The new API brings the following new features to
+ multi-buffer hashes. The older API is still included but may be deprecated in
+ future releases.
+
+ - Multibinary functionality. Call one function and the appropriate
+ architecture-specific version is fixed up at runtime.
+
+  - No restriction on update length. Submitting an update block no longer has
+    to have a length that is a multiple of the fundamental block size.
+
+* New expanded key tests added for AES-XTS 128 and 256
+
+v2.7
+
+* New AVX2 versions for mb_md5 and mb_sha512 hashing code have been added.
+
+v2.6
+
+* Update buffer functionality added to mb_md5, mb_sha256 and mb_sha512 hashing
+ code. Requires API changes to current interface to specify job type and total
+ length of hash.
+
+* New AVX2 versions for mb_sha1 and mb_sha256 hashing code have been added.
+
+v2.5
+
+* New feature for multi-buffer SHA-1, update buffer. mb_sha1 non-finalize jobs
+ can now be submitted by setting flags in job structure. Requires API changes
+ to current interface to specify job type and total length of hash.
+
+v2.4
+
+* Added new multi-buffer SHA-512: mb_sha512. SSE, AVX versions.
+
+v2.3
+
+* Added improved AES XTS versions.
+
+v2.2
+
+* Added new AVX versions of multi-buffer hashes
+* Changed a type in the interface structs for multi-buffer hashes:
+  the len field in the JOB_SHA1, JOB_MD5 and JOB_SHA256 structures
+  is now a 32-bit int.
+
+v2.0
+
+* Added AES XTS units aes_xts_128, aes_xts_256
+
+v1.3
+
+* Added new multi-buffer units for SHA-256 and MD5: mb_sha256, mb_md5.
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/Makefile.am b/src/crypto/isa-l/isa-l_crypto/aes/Makefile.am
new file mode 100644
index 000000000..d1f4e5781
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/Makefile.am
@@ -0,0 +1,170 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+# Assembling AES requires including cbc_common.asm, gcm_defines.asm
+src_include += -I $(srcdir)/aes
+
+extern_hdrs += include/aes_gcm.h include/aes_cbc.h include/aes_xts.h include/aes_keyexp.h
+
+lsrc_x86_64 += aes/gcm_multibinary.asm aes/gcm_pre.c
+lsrc_x86_64 += aes/gcm128_avx_gen2.asm aes/gcm128_avx_gen4.asm aes/gcm128_sse.asm
+lsrc_x86_64 += aes/gcm256_avx_gen2.asm aes/gcm256_avx_gen4.asm aes/gcm256_sse.asm
+lsrc_x86_64 += aes/gcm128_vaes_avx512.asm aes/gcm256_vaes_avx512.asm
+lsrc_x86_64 += aes/gcm128_avx_gen2_nt.asm aes/gcm128_avx_gen4_nt.asm aes/gcm128_sse_nt.asm
+lsrc_x86_64 += aes/gcm256_avx_gen2_nt.asm aes/gcm256_avx_gen4_nt.asm aes/gcm256_sse_nt.asm
+lsrc_x86_64 += aes/gcm128_vaes_avx512_nt.asm aes/gcm256_vaes_avx512_nt.asm
+
+lsrc_x86_64 += aes/gcm_multibinary_nt.asm
+
+lsrc_x86_64 += aes/keyexp_multibinary.asm
+lsrc_x86_64 += aes/keyexp_128.asm aes/keyexp_192.asm aes/keyexp_256.asm
+lsrc_x86_64 += aes/cbc_multibinary.asm
+lsrc_x86_64 += aes/cbc_dec_128_x4_sse.asm aes/cbc_dec_128_x8_avx.asm
+lsrc_x86_64 += aes/cbc_dec_192_x4_sse.asm aes/cbc_dec_192_x8_avx.asm
+lsrc_x86_64 += aes/cbc_dec_256_x4_sse.asm aes/cbc_dec_256_x8_avx.asm
+lsrc_x86_64 += aes/cbc_enc_128_x4_sb.asm aes/cbc_enc_128_x8_sb.asm
+lsrc_x86_64 += aes/cbc_enc_192_x4_sb.asm aes/cbc_enc_192_x8_sb.asm
+lsrc_x86_64 += aes/cbc_enc_256_x4_sb.asm aes/cbc_enc_256_x8_sb.asm
+lsrc_x86_64 += aes/cbc_dec_vaes_avx512.asm
+lsrc_x86_64 += aes/cbc_pre.c
+lsrc_x86_64 += aes/xts_aes_128_multibinary.asm
+lsrc_x86_64 += aes/XTS_AES_128_dec_sse.asm aes/XTS_AES_128_dec_expanded_key_sse.asm
+lsrc_x86_64 += aes/XTS_AES_128_enc_sse.asm aes/XTS_AES_128_enc_expanded_key_sse.asm
+lsrc_x86_64 += aes/XTS_AES_128_dec_avx.asm aes/XTS_AES_128_dec_expanded_key_avx.asm
+lsrc_x86_64 += aes/XTS_AES_128_enc_avx.asm aes/XTS_AES_128_enc_expanded_key_avx.asm
+lsrc_x86_64 += aes/xts_aes_256_multibinary.asm
+lsrc_x86_64 += aes/XTS_AES_256_dec_avx.asm aes/XTS_AES_256_dec_expanded_key_avx.asm
+lsrc_x86_64 += aes/XTS_AES_256_enc_avx.asm aes/XTS_AES_256_enc_expanded_key_avx.asm
+lsrc_x86_64 += aes/XTS_AES_256_dec_sse.asm aes/XTS_AES_256_dec_expanded_key_sse.asm
+lsrc_x86_64 += aes/XTS_AES_256_enc_sse.asm aes/XTS_AES_256_enc_expanded_key_sse.asm
+lsrc_x86_64 += aes/XTS_AES_256_enc_vaes.asm
+lsrc_x86_64 += aes/XTS_AES_128_enc_vaes.asm
+lsrc_x86_64 += aes/XTS_AES_256_enc_expanded_key_vaes.asm
+lsrc_x86_64 += aes/XTS_AES_128_enc_expanded_key_vaes.asm
+lsrc_x86_64 += aes/XTS_AES_256_dec_vaes.asm
+lsrc_x86_64 += aes/XTS_AES_128_dec_vaes.asm
+lsrc_x86_64 += aes/XTS_AES_256_dec_expanded_key_vaes.asm
+lsrc_x86_64 += aes/XTS_AES_128_dec_expanded_key_vaes.asm
+
+lsrc_x86_32 += $(lsrc_x86_64)
+
+lsrc_aarch64 += aes/gcm_pre.c \
+ aes/aarch64/gcm_multibinary_aarch64.S \
+ aes/aarch64/keyexp_multibinary_aarch64.S \
+ aes/aarch64/gcm_aarch64_dispatcher.c \
+ aes/aarch64/keyexp_aarch64_dispatcher.c \
+ aes/aarch64/keyexp_128_aarch64_aes.S \
+ aes/aarch64/keyexp_192_aarch64_aes.S \
+ aes/aarch64/keyexp_256_aarch64_aes.S \
+ aes/aarch64/aes_gcm_aes_finalize_128.S \
+ aes/aarch64/aes_gcm_aes_init.S \
+ aes/aarch64/aes_gcm_enc_dec_128.S \
+ aes/aarch64/aes_gcm_precomp_128.S \
+ aes/aarch64/aes_gcm_update_128.S \
+ aes/aarch64/aes_gcm_aes_finalize_256.S \
+ aes/aarch64/aes_gcm_consts.S \
+ aes/aarch64/aes_gcm_enc_dec_256.S \
+ aes/aarch64/aes_gcm_precomp_256.S \
+ aes/aarch64/aes_gcm_update_256.S \
+ aes/aarch64/xts_aarch64_dispatcher.c \
+ aes/aarch64/xts_aes_128_dec.S \
+ aes/aarch64/xts_aes_128_enc.S \
+ aes/aarch64/xts_keyexp_aes_128_dec.S \
+ aes/aarch64/xts_keyexp_aes_128_enc.S \
+ aes/aarch64/xts_aes_256_dec.S \
+ aes/aarch64/xts_aes_256_enc.S \
+ aes/aarch64/xts_keyexp_aes_256_dec.S \
+ aes/aarch64/xts_keyexp_aes_256_enc.S \
+ aes/aarch64/xts_multibinary_aarch64.S \
+ aes/cbc_pre.c \
+ aes/aarch64/cbc_multibinary_aarch64.S \
+ aes/aarch64/cbc_aarch64_dispatcher.c \
+ aes/aarch64/cbc_enc_aes.S \
+ aes/aarch64/cbc_dec_aes.S
+
+other_src += include/multibinary.asm
+other_src += include/test.h include/types.h include/reg_sizes.asm
+other_src += aes/gcm_defines.asm
+other_src += aes/aes_common.asm
+other_src += aes/clear_regs.asm
+other_src += aes/cbc_common.asm aes/cbc_std_vectors.h
+other_src += aes/gcm_vectors.h aes/ossl_helper.h
+other_src += aes/xts_128_vect.h
+other_src += aes/xts_256_vect.h
+other_src += aes/gcm_sse.asm
+other_src += aes/gcm_avx_gen2.asm
+other_src += aes/gcm_avx_gen4.asm
+other_src += aes/gcm_keys_vaes_avx512.asm
+other_src += aes/gcm_vaes_avx512.asm
+
+check_tests += aes/cbc_std_vectors_test
+check_tests += aes/gcm_std_vectors_test
+check_tests += aes/gcm_nt_std_vectors_test
+check_tests += aes/xts_128_test
+check_tests += aes/xts_256_test
+check_tests += aes/xts_128_expanded_key_test
+check_tests += aes/xts_256_expanded_key_test
+
+unit_tests += aes/cbc_std_vectors_random_test
+unit_tests += aes/gcm_std_vectors_random_test
+unit_tests += aes/gcm_nt_rand_test
+unit_tests += aes/xts_128_rand aes/xts_128_rand_ossl_test
+unit_tests += aes/xts_256_rand aes/xts_256_rand_ossl_test
+
+perf_tests += aes/cbc_ossl_perf
+perf_tests += aes/gcm_ossl_perf
+perf_tests += aes/xts_128_enc_ossl_perf
+perf_tests += aes/xts_256_enc_ossl_perf
+perf_tests += aes/xts_128_enc_perf aes/xts_128_dec_perf aes/xts_128_dec_ossl_perf
+perf_tests += aes/xts_256_enc_perf aes/xts_256_dec_perf aes/xts_256_dec_ossl_perf
+
+examples += aes/gcm_simple_example
+
+cbc_ossl_perf: LDLIBS += -lcrypto
+aes_cbc_ossl_perf_LDFLAGS = -lcrypto
+cbc_std_vectors_random_test: LDLIBS += -lcrypto
+aes_cbc_std_vectors_random_test_LDFLAGS = -lcrypto
+gcm_ossl_perf: LDLIBS += -lcrypto
+aes_gcm_ossl_perf_LDFLAGS = -lcrypto
+gcm_std_vectors_random_test: LDLIBS += -lcrypto
+aes_gcm_std_vectors_random_test_LDFLAGS = -lcrypto
+gcm_nt_rand_test: LDLIBS += -lcrypto
+aes_gcm_nt_rand_test_LDFLAGS = -lcrypto
+xts_128_enc_ossl_perf: LDLIBS += -lcrypto
+aes_xts_128_enc_ossl_perf_LDFLAGS = -lcrypto
+xts_128_dec_ossl_perf: LDLIBS += -lcrypto
+aes_xts_128_dec_ossl_perf_LDFLAGS = -lcrypto
+xts_128_rand_ossl_test: LDLIBS += -lcrypto
+aes_xts_128_rand_ossl_test_LDFLAGS = -lcrypto
+xts_256_enc_ossl_perf : LDLIBS += -lcrypto
+aes_xts_256_enc_ossl_perf_LDFLAGS = -lcrypto
+xts_256_dec_ossl_perf : LDLIBS += -lcrypto
+aes_xts_256_dec_ossl_perf_LDFLAGS = -lcrypto
+xts_256_rand_ossl_test: LDLIBS += -lcrypto
+aes_xts_256_rand_ossl_test_LDFLAGS = -lcrypto
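The XTS sources and tests listed above are reached through the multibinary entry
points declared in include/aes_xts.h; their argument order mirrors the prototype
documented in the XTS_AES_128_dec_avx.asm header that follows (k2 tweak key, k1
data key, initial tweak, byte count, input, output). A minimal round-trip sketch,
assuming those dispatcher prototypes and using placeholder keys and a 512-byte
sector:

    #include <stdint.h>
    #include <string.h>
    #include "aes_xts.h"

    int main(void)
    {
        uint8_t key1[16] = {0};   /* k1: data encryption key */
        uint8_t key2[16] = {0};   /* k2: tweak key */
        uint8_t tweak[16] = {0};  /* initial tweak value, e.g. the sector number */
        uint8_t pt[512], ct[512], out[512];

        memset(pt, 0xab, sizeof(pt));

        /* Keys are expanded internally; the *_expanded_key variants take
         * pre-expanded keys instead. */
        XTS_AES_128_enc(key2, key1, tweak, sizeof(pt), pt, ct);
        XTS_AES_128_dec(key2, key1, tweak, sizeof(ct), ct, out);

        return memcmp(pt, out, sizeof(pt)) != 0;
    }
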
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm
new file mode 100644
index 000000000..85582c0df
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm
@@ -0,0 +1,1778 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 128-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 11 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_dec_avx(
+; UINT8 *k2, // key used for tweaking, 16*1 bytes
+; UINT8 *k1, // key used for "ECB" decryption, 16*1 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
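
The repeated shl/adc/cmovc/xor sequences on twtempl:twtemph in the macros below
compute the next tweak by multiplying the current 128-bit tweak by x in GF(2^128)
and reducing with GHASH_POLY (0x87). The same single step in C, for reference
(function and variable names are illustrative only):

    #include <stdint.h>

    /* tw <- tw * x mod (x^128 + x^7 + x^2 + x + 1);
     * tw[0] is the low 64 bits (twtempl), tw[1] the high 64 bits (twtemph). */
    static void xts_mul_x(uint64_t tw[2])
    {
        uint64_t carry = tw[1] >> 63;              /* bit shifted out of the top half */
        tw[1] = (tw[1] << 1) | (tw[0] >> 63);      /* adc twtemph, twtemph */
        tw[0] = (tw[0] << 1) ^ (carry ? 0x87 : 0); /* shl twtempl, 1; conditional xor with 0x87 */
    }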
+
+; produce the key for the next round
+; raw_key is the output of vaeskeygenassist instruction
+; round_key value before this key_expansion_128 macro is current round key
+; round_key value after this key_expansion_128 macro is next round key
+%macro key_expansion_128 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 9
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%xtmp2 %6
+%define %%ptr_key2 %7
+%define %%ptr_key1 %8
+%define %%ptr_expanded_keys %9
+
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1
+
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*9], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*8], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*7], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*6], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*5], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*4], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*3], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*2], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*1], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*0], %%xkey1
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; decrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; next 8 Tweak values are generated
+%macro decrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks decrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; decrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro decrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+ vaesdeclast %%ST5, %%T0
+ vaesdeclast %%ST6, %%T0
+ vaesdeclast %%ST7, %%T0
+ vaesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
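+
+; Note on the macro above: the scalar tweak-update arithmetic (shl/adc/cmovc/xor on
+; twtempl:twtemph) that computes the next eight tweak values is interleaved between the
+; vaesdec rounds, so the general-purpose-register work can overlap the latency of the
+; AES rounds.  When %%last_eight is non-zero this is the final group of blocks and the
+; tweak update is skipped.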
+
+
+section .text
+
+mk_global XTS_AES_128_dec_avx, function
+XTS_AES_128_dec_avx:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because the last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ vmovdqa xmm1, [TW + 16*7]
+ vmovdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
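+
+	; For XTS decryption with ciphertext stealing the tweak order is reversed: the last
+	; full ciphertext block (block 8 below) is decrypted with the tweak value just
+	; generated and stored at [TW + 16*7], while the tweak it displaced, saved at
+	; [TW + 16*0], is used later in _steal_cipher for the final partial block.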
+
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+ vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+	vpxor	xmm8, xmm3	; state register is xmm8: instead of moving xmm3 to xmm8, the destination register of the vpxor is swapped
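+
+	; xmm8 now holds the final block to decrypt: the N_val bytes of the trailing partial
+	; ciphertext combined with the stolen upper (16 - N_val) bytes of the block decrypted
+	; above, already xored with the tweak saved at [TW + 16*0].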
+
+
+ ;decrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+ vaesdeclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm15
+ vmovdqa xmm15, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm14
+ vmovdqa xmm14, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm13
+ vmovdqa xmm13, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm12
+ vmovdqa xmm12, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm11
+ vmovdqa xmm11, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm10
+ vmovdqa xmm10, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm9
+ vmovdqa xmm9, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
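+
+; Usage note: _steal_cipher loads 16 bytes at [vpshufb_shf_table + N_val] to obtain a
+; vpshufb mask that shifts a register left by (16 - N_val) bytes (mask bytes with the
+; top bit set produce zero), and loads at [vpshufb_shf_table + 16 - N_val], xored with
+; mask1, to obtain the complementary right shift.  The commented rows above list the
+; masks that result for each possible value of N_val.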
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_avx.asm
new file mode 100644
index 000000000..faa7e895e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_avx.asm
@@ -0,0 +1,1748 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 128-bit AES
+; expanded keys are not aligned
+; plaintext and ciphertext are not aligned
+; second key is stored on the stack, 16-byte aligned
+; first key is required only once, so there is no need to store it
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 11 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
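+
+; Stack layout after the prologue's "sub rsp, VARIABLE_OFFSET": 8 tweak values at [TW],
+; 11 round keys at [keys], then (win64 only) the xmm6-xmm15 save area at [_xmm], and the
+; saved GPRs at [_gpr].  Keeping VARIABLE_OFFSET an odd multiple of 8 leaves rsp 16-byte
+; aligned inside the function (the call pushed 8 bytes), so the vmovdqa accesses to these
+; areas are valid.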
+
+%define GHASH_POLY 0x87
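+
+; The per-block tweak update multiplies the 128-bit tweak by x in GF(2^128) using the
+; XTS reduction polynomial x^128 + x^7 + x^2 + x + 1 (low byte 0x87).  On the 64-bit
+; halves kept in twtempl:twtemph this amounts to:
+;   carry   = msb(twtemph)
+;   twtemph = (twtemph << 1) | msb(twtempl)
+;   twtempl = (twtempl << 1) ^ (carry ? GHASH_POLY : 0)
+; and is implemented below with shl/adc plus a cmovc that selects 0 or ghash_poly_8b.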
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_dec_expanded_key_avx(
+; UINT8 *k2, // key used for tweaking, 16*11 bytes
+; UINT8 *k1, // key used for "ECB" decryption, 16*11 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
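+
+; Illustrative usage sketch (C, comments only, not part of the build).  It assumes the
+; aes_keyexp_128() helper from this library's aes_keyexp module, which expands a raw
+; 16-byte key into encryption and decryption round-key schedules; treat that helper's
+; name and signature as an assumption here.  The code below requires k2 expanded for
+; encryption (the tweak is always encrypted) and k1 expanded for decryption.
+;
+;   uint8_t k2_enc[16*11], k2_dec[16*11], k1_enc[16*11], k1_dec[16*11];
+;   aes_keyexp_128(raw_tweak_key, k2_enc, k2_dec);
+;   aes_keyexp_128(raw_data_key,  k1_enc, k1_dec);
+;   XTS_AES_128_dec_expanded_key_avx(k2_enc, k1_dec, tweak, sector_len, ct, pt);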
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
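+
+; Note: despite the names, in this decrypt routine ptr_plaintext is the ciphertext
+; input (ct) and ptr_ciphertext is the plaintext output (pt).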
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*0]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
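+
+; The macro above does two jobs in one pass: it runs the ten vaesenc rounds that encrypt
+; the initial tweak with the key2 schedule, and, interleaved with those rounds, it copies
+; the 11 key1 round keys into the aligned stack area at [keys] so that the decryption
+; rounds below can load them with vmovdqa.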
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; decrypt initial blocks of AES (the macro keeps the encrypt_* name; the rounds below use vaesdec)
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks decrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Decrypt 8 blocks in parallel (the macro keeps the encrypt_by_eight name; the rounds below use vaesdec)
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+ vaesdeclast %%ST5, %%T0
+ vaesdeclast %%ST6, %%T0
+ vaesdeclast %%ST7, %%T0
+ vaesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_128_dec_expanded_key_avx, function
+XTS_AES_128_dec_expanded_key_avx:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because the last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ vmovdqa xmm1, [TW + 16*7]
+ vmovdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+ vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+ vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+ vaesdeclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm15
+ vmovdqa xmm15, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm14
+ vmovdqa xmm14, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm13
+ vmovdqa xmm13, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm12
+ vmovdqa xmm12, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm11
+ vmovdqa xmm11, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm10
+ vmovdqa xmm10, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm9
+ vmovdqa xmm9, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
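+;
+; Illustration only (nothing below is used by the assembly): the table above
+; holds vpshufb control bytes. vpshufb writes src[mask & 0x0f] into each byte
+; lane and zeroes any lane whose mask byte has bit 7 set, so the 16 bytes
+; starting at offset N shift a block left by 16-N bytes, while the entry at
+; offset 16-N (after the XOR with mask1) shifts the stolen bytes and doubles
+; as the vpblendvb selector. A minimal C emulation of the left shift:
+;
+;     #include <stdint.h>
+;     #include <string.h>
+;
+;     static const uint8_t shf_table[32] = {
+;         0x00,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
+;         0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
+;         0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
+;         0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x00,
+;     };
+;
+;     static void pshufb_emulate(uint8_t dst[16], const uint8_t src[16],
+;                                const uint8_t mask[16])
+;     {
+;         for (int i = 0; i < 16; i++)
+;             dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
+;     }
+;
+;     /* shift blk left by (16 - n) bytes, n in 1..15, as done by
+;        "vmovdqu xmm0, [vpshufb_shf_table + N_val]" / "vpshufb xmm8, xmm0" */
+;     static void shift_left_16_minus_n(uint8_t blk[16], unsigned n)
+;     {
+;         uint8_t out[16];
+;         pshufb_emulate(out, blk, &shf_table[n]);
+;         memcpy(blk, out, 16);
+;     }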
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_sse.asm
new file mode 100644
index 000000000..0b1b637be
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_sse.asm
@@ -0,0 +1,1747 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 128-bit AES
+; expanded keys are not aligned
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 11 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
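+
+; As an illustration only (the assembly never declares such a type), the elf64
+; scratch area laid out by the defines above can be pictured as the following
+; hypothetical C struct; its size is 16*8 + 16*11 + 8*1 = 312 bytes, the odd
+; multiple of 8 required of VARIABLE_OFFSET (the win64 frame additionally
+; saves xmm6:xmm15 and rdi/rsi, for a total of 488 bytes):
+;
+;     #include <stdint.h>
+;
+;     struct xts_dec_stack_frame_elf64 {      /* hypothetical name */
+;         uint8_t  tweaks[8][16];             /* TW:   8 tweak values   */
+;         uint8_t  round_keys[11][16];        /* keys: 11 expanded keys */
+;         uint64_t saved_rbx;                 /* _gpr: callee-saved rbx */
+;     };
+;
+;     _Static_assert(sizeof(struct xts_dec_stack_frame_elf64) == 312,
+;                    "matches the elf64 VARIABLE_OFFSET above");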
+
+%define GHASH_POLY 0x87
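+
+; GHASH_POLY is the low byte of the XTS reduction polynomial
+; x^128 + x^7 + x^2 + x + 1. Every "shl twtempl / adc twtemph / cmovc / xor"
+; sequence below multiplies the 128-bit tweak by x in GF(2^128), with the
+; tweak held as two little-endian 64-bit halves. A minimal C sketch of that
+; update (tw[0] plays twtempl, tw[1] plays twtemph):
+;
+;     #include <stdint.h>
+;
+;     static void xts_mul_x(uint64_t tw[2])
+;     {
+;         uint64_t carry = tw[1] >> 63;          /* bit shifted out of the top  */
+;         tw[1] = (tw[1] << 1) | (tw[0] >> 63);  /* adc twtemph, twtemph        */
+;         tw[0] <<= 1;                           /* shl twtempl, 1              */
+;         if (carry)
+;             tw[0] ^= 0x87;                     /* cmovc + xor with GHASH_POLY */
+;     }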
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_dec_expanded_key_sse(
+; UINT8 *k2, // key used for tweaking, 16*11 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*11 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
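+;
+; A hedged usage sketch (not part of this file): one plausible way to drive
+; the routine from C. Only the prototype above comes from this file; the
+; aes_keyexp_128() declaration is an assumption about the companion
+; isa-l_crypto key-expansion helper, and the wrapper name is hypothetical.
+;
+;     #include <stdint.h>
+;
+;     void XTS_AES_128_dec_expanded_key_sse(uint8_t *k2, uint8_t *k1,
+;                                           uint8_t *TW_initial, uint64_t N,
+;                                           const uint8_t *ct, uint8_t *pt);
+;     /* assumed helper: expands a raw 16-byte key into enc and dec schedules */
+;     void aes_keyexp_128(uint8_t *key, uint8_t *exp_key_enc, uint8_t *exp_key_dec);
+;
+;     static void xts_128_decrypt_sector(uint8_t tweak_key[16], uint8_t data_key[16],
+;                                        uint8_t iv[16], const uint8_t *ct,
+;                                        uint8_t *pt, uint64_t len)
+;     {
+;         uint8_t k2_enc[16 * 11], k2_dec[16 * 11]; /* tweak key: enc schedule is used */
+;         uint8_t k1_enc[16 * 11], k1_dec[16 * 11]; /* data key: dec schedule is used  */
+;
+;         aes_keyexp_128(tweak_key, k2_enc, k2_dec);
+;         aes_keyexp_128(data_key, k1_enc, k1_dec);
+;         XTS_AES_128_dec_expanded_key_sse(k2_enc, k1_dec, iv, len, ct, pt);
+;     }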
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*10]
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*9]
+ movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*2]
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*8]
+ movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*3]
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*7]
+ movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*4]
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*6]
+ movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*5]
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*5]
+ movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*6]
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*4]
+ movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*7]
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*3]
+ movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*8]
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*2]
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*9]
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*1]
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*10]
+ aesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*0]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdeclast %%ST1, %%T0
+ aesdeclast %%ST2, %%T0
+ aesdeclast %%ST3, %%T0
+ aesdeclast %%ST4, %%T0
+ aesdeclast %%ST5, %%T0
+ aesdeclast %%ST6, %%T0
+ aesdeclast %%ST7, %%T0
+ aesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
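+
+; A purely sequential, hedged reference model of what one pass of the macro
+; above computes per 16-byte block (decrypt-side XTS: PT = AES-128-DEC(CT xor T)
+; xor T, then T := T*x). OpenSSL's low-level AES block API is used here only
+; for illustration and is not used anywhere by this file; whole blocks only,
+; the partial tail is handled by the _steal_cipher path.
+;
+;     #include <stddef.h>
+;     #include <stdint.h>
+;     #include <openssl/aes.h>
+;
+;     static void mul_x(uint8_t t[16])            /* byte-wise form of the shl/adc chain */
+;     {
+;         uint8_t carry = t[15] >> 7;
+;         for (int i = 15; i > 0; i--)
+;             t[i] = (uint8_t)((t[i] << 1) | (t[i - 1] >> 7));
+;         t[0] = (uint8_t)(t[0] << 1);
+;         if (carry)
+;             t[0] ^= 0x87;
+;     }
+;
+;     void xts_128_dec_ref(const uint8_t k1[16], const uint8_t k2[16],
+;                          const uint8_t iv[16], const uint8_t *ct,
+;                          uint8_t *pt, size_t nblocks)
+;     {
+;         AES_KEY ek2, dk1;
+;         uint8_t t[16], buf[16];
+;
+;         AES_set_encrypt_key(k2, 128, &ek2);
+;         AES_set_decrypt_key(k1, 128, &dk1);
+;         AES_encrypt(iv, t, &ek2);               /* like encrypt_T: T0 = E_k2(IV) */
+;
+;         for (size_t i = 0; i < nblocks; i++) {
+;             for (int j = 0; j < 16; j++)
+;                 buf[j] = ct[16 * i + j] ^ t[j]; /* pxor %%STx, %%TWx             */
+;             AES_decrypt(buf, buf, &dk1);        /* the aesdec/aesdeclast rounds  */
+;             for (int j = 0; j < 16; j++)
+;                 pt[16 * i + j] = buf[j] ^ t[j]; /* xor the tweak back in         */
+;             mul_x(t);                           /* next tweak                    */
+;         }
+;     }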
+
+
+section .text
+
+mk_global XTS_AES_128_dec_expanded_key_sse, function
+XTS_AES_128_dec_expanded_key_sse:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
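+
+	; Worked example (illustration): for a 512-byte sector N_val = 512, so
+	; target_ptr_val = 512 - (512 mod 16) - 128 = 384 (plus ptr_ciphertext);
+	; the dispatch below selects (512/16) mod 8 = 0 initial blocks, the main
+	; loop then decrypts bytes 0..383 in 128-byte strides, and the final 128
+	; bytes take the _last_eight/_done_final path with no stitched tweak
+	; generation.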
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ movdqa xmm1, [TW + 16*7]
+ movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
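+
+	; At this point TW[0] holds the previous eighth tweak and TW[7] holds the
+	; newly generated one: the last full ciphertext block below is decrypted
+	; with the *next* tweak, while the reassembled stolen block in
+	; _steal_cipher is decrypted with the previous tweak taken from [TW],
+	; which is the ordering XTS ciphertext stealing requires on the decrypt
+	; side.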
+
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesdec xmm8, [keys + 16*1] ; round 1
+ aesdec xmm8, [keys + 16*2] ; round 2
+ aesdec xmm8, [keys + 16*3] ; round 3
+ aesdec xmm8, [keys + 16*4] ; round 4
+ aesdec xmm8, [keys + 16*5] ; round 5
+ aesdec xmm8, [keys + 16*6] ; round 6
+ aesdec xmm8, [keys + 16*7] ; round 7
+ aesdec xmm8, [keys + 16*8] ; round 8
+ aesdec xmm8, [keys + 16*9] ; round 9
+ aesdeclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm15
+ movdqa xmm15, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm14
+ movdqa xmm14, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm13
+ movdqa xmm13, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm12
+ movdqa xmm12, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm11
+ movdqa xmm11, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm10
+ movdqa xmm10, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm9
+ movdqa xmm9, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_vaes.asm
new file mode 100644
index 000000000..7f243949a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_vaes.asm
@@ -0,0 +1,1648 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 128-bit AES
+; expanded keys are not aligned
+; the pre-expanded round keys of key1 are copied to the stack in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys	rsp + 16*8	; space for 15 round keys (AES-128 uses 11)
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
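+
+; 0x87 is the low byte of the XTS reduction polynomial x^128 + x^7 + x^2 + x + 1.
+; Each tweak is derived from the previous one by a GF(2^128) doubling:
+;     T' = (T << 1) ^ (0x87 if bit 127 of T was set, else 0)
+; The scalar paths below implement this with shl/adc/cmovc/xor on the two
+; 64-bit halves; the zmm paths implement it with vpclmulqdq against zpoly.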
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_dec_expanded_key_vaes(
+;               UINT8 *k2,              // key used for tweaking, 16*11 bytes
+;               UINT8 *k1,              // key used for "ECB" decryption, 16*11 bytes
+;               UINT8 *TW_initial,      // initial tweak value, 16 bytes
+;               UINT64 N,               // sector size, in bytes
+;               const UINT8 *ct,        // ciphertext sector input data
+;               UINT8 *pt);             // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+%define zpoly zmm25
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*0]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
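+
+; Note: in this expanded-key variant encrypt_T performs no key expansion; it
+; encrypts the initial tweak with the pre-expanded key2 schedule while copying
+; the 11 pre-expanded round keys of key1 into the aligned stack area at [keys],
+; so the data path can use aligned vmovdqa loads.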
+
+
+; Original (scalar) way to generate the initial tweak values and load the input blocks
+; only used for small numbers of blocks
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+%endmacro
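+
+; The chain above derives tweak i+1 from tweak i with the scalar
+; shl/adc/cmovc/xor sequence, i.e. a 128-bit left shift with conditional
+; reduction by GHASH_POLY, writing each value to [TW] before loading it into
+; its xmm register.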
+
+
+; Original (scalar) decrypt of the initial AES blocks
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; the next 8 tweak values can be generated
+%macro decrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks decrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
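+
+; When %%lt128 is 0, the macro above also computes the next 8 tweak values in
+; the scalar twtempl/twtemph registers and stores them to [TW] between AES
+; rounds, so the GF(2^128) updates overlap with the vaesdec latency.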
+
+
+
+; Decrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro decrypt_by_eight_zmm 6
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%TW1 %3 ; tweak 1
+%define %%TW2 %4 ; tweak 2
+%define %%T0 %5 ; Temp register
+%define %%last_eight %6
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW1, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW1, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW2, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW2, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+%endmacro
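+
+; Tweak update used above: each 128-bit lane of %%TW1/%%TW2 is multiplied by
+; 2^8. vpsrldq isolates the top byte of each lane, vpclmulqdq against zpoly
+; produces the reduction term for the bits shifted out, and vpslldq/vpxord
+; combine them into the next 8 tweak values (zmm15/zmm16).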
+
+
+; Decrypt 16 blocks in parallel
+; generate next 8 tweak values
+%macro decrypt_by_16_zmm 10
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+
+%define %%TW1 %5 ; tweak 1
+%define %%TW2 %6 ; tweak 2
+%define %%TW3 %7 ; tweak 3
+%define %%TW4 %8 ; tweak 4
+
+%define %%T0 %9 ; Temp register
+%define %%last_eight %10
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+ vpxorq %%ST3, %%T0
+ vpxorq %%ST4, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW3, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW3, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW4, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW4, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm15, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm17, zmm15, 1
+ vpxord zmm17, zmm17, zmm14
+%endif
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm16, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm18, zmm16, 1
+ vpxord zmm18, zmm18, zmm14
+%endif
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+ vmovdqa32 %%TW3, zmm17
+ vmovdqa32 %%TW4, zmm18
+%endmacro
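+
+; Same scheme as decrypt_by_eight_zmm, but with four zmm state registers
+; (16 blocks per iteration). The next 16 tweaks are built from %%TW3/%%TW4:
+; zmm15/zmm16 hold them multiplied by 2^8 and zmm17/zmm18 by 2^16.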
+
+
+section .text
+
+mk_global XTS_AES_128_dec_expanded_key_vaes, function
+XTS_AES_128_dec_expanded_key_vaes:
+ endbranch
+
+%define ALIGN_STACK
+%ifdef ALIGN_STACK
+ push rbp
+ mov rbp, rsp
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+%else
+ sub rsp, VARIABLE_OFFSET
+%endif
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+ cmp N_val, 128
+ jl _less_than_128_bytes
+
+ vpbroadcastq zpoly, ghash_poly_8b
+
+ cmp N_val, 256
+ jge _start_by16
+
+ cmp N_val, 128
+ jge _start_by8
+
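+; _do_n_blocks handles the 1..7 block tail left over after the wide loops.
+; If N is not a multiple of 16, each case falls through to _steal_cipher to
+; finish the partial block with ciphertext stealing.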
+_do_n_blocks:
+ cmp N_val, 0
+ je _ret_
+
+ cmp N_val, (7*16)
+ jge _remaining_num_blocks_is_7
+
+ cmp N_val, (6*16)
+ jge _remaining_num_blocks_is_6
+
+ cmp N_val, (5*16)
+ jge _remaining_num_blocks_is_5
+
+ cmp N_val, (4*16)
+ jge _remaining_num_blocks_is_4
+
+ cmp N_val, (3*16)
+ jge _remaining_num_blocks_is_3
+
+ cmp N_val, (2*16)
+ jge _remaining_num_blocks_is_2
+
+ cmp N_val, (1*16)
+ jge _remaining_num_blocks_is_1
+
+;; _remaining_num_blocks_is_0:
+	vmovdqu	xmm1, [ptr_plaintext - 16]	; Re-do the last full block with the next tweak
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext - 16], xmm1
+ vmovdqa xmm8, xmm1
+
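+	; For ciphertext stealing with no full block left in the tail, the last
+	; full ciphertext block is decrypted again with the following tweak
+	; (xmm9); the preceding tweak is reconstructed below by undoing one
+	; GF(2^128) doubling of xmm9 (shift right by one bit and fold the
+	; polynomial back in when the shifted-out bit was set).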
+ ; Calc previous tweak
+ mov tmp1, 1
+ kmovq k1, tmp1
+ vpsllq xmm13, xmm9, 63
+ vpsraq xmm14, xmm13, 63
+ vpandq xmm5, xmm14, XWORD(zpoly)
+ vpxorq xmm9 {k1}, xmm9, xmm5
+ vpsrldq xmm10, xmm9, 8
+ vpshrdq xmm0, xmm9, xmm10, 1
+ vpslldq xmm13, xmm13, 8
+ vpxorq xmm0, xmm0, xmm13
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_7:
+ mov tmp1, -1
+ shr tmp1, 16
+ kmovq k1, tmp1
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*7
+ and N_val, 15
+ je _done_7_remain
+ vextracti32x4 xmm12, zmm10, 2
+ vextracti32x4 xmm13, zmm10, 3
+ vinserti32x4 zmm10, xmm13, 2
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ add ptr_ciphertext, 16*7
+ vextracti32x4 xmm8, zmm2, 0x2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_7_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_6:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 ymm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*6
+ and N_val, 15
+ je _done_6_remain
+ vextracti32x4 xmm12, zmm10, 1
+ vextracti32x4 xmm13, zmm10, 2
+ vinserti32x4 zmm10, xmm13, 1
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ add ptr_ciphertext, 16*6
+ vextracti32x4 xmm8, zmm2, 0x1
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_6_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ jmp _ret_
+
+_remaining_num_blocks_is_5:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*5
+ and N_val, 15
+ je _done_5_remain
+ vmovdqa xmm12, xmm10
+ vextracti32x4 xmm10, zmm10, 1
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_5_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_4:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ add ptr_plaintext, 16*4
+ and N_val, 15
+ je _done_4_remain
+ vextracti32x4 xmm12, zmm9, 3
+ vinserti32x4 zmm9, xmm10, 3
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ add ptr_ciphertext, 16*4
+ vextracti32x4 xmm8, zmm1, 0x3
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_4_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ jmp _ret_
+
+_remaining_num_blocks_is_3:
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ add ptr_plaintext, 16*3
+ and N_val, 15
+ je _done_3_remain
+ vextracti32x4 xmm13, zmm9, 2
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 3
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+ vmovdqa xmm0, xmm13
+ jmp _steal_cipher
+_done_3_remain:
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 2
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ jmp _ret_
+
+_remaining_num_blocks_is_2:
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ add ptr_plaintext, 16*2
+ and N_val, 15
+ je _done_2_remain
+ vextracti32x4 xmm10, zmm9, 2
+ vextracti32x4 xmm12, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_2_remain:
+ vextracti32x4 xmm10, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_1:
+ vmovdqu xmm1, [ptr_plaintext]
+ add ptr_plaintext, 16
+ and N_val, 15
+ je _done_1_remain
+ vextracti32x4 xmm11, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm11, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+ vmovdqa xmm8, xmm1
+ vmovdqa xmm0, xmm9
+ jmp _steal_cipher
+_done_1_remain:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ jmp _ret_
+
+
+
+_start_by16:
+	; Make first 7 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+	; Make next 8 tweak values by multiplying each by 2^8
+ vpsrldq zmm13, zmm9, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm11, zmm9, 1
+ vpxord zmm11, zmm11, zmm14
+
+ vpsrldq zmm15, zmm10, 15
+ vpclmulqdq zmm16, zmm15, zpoly, 0
+ vpslldq zmm12, zmm10, 1
+ vpxord zmm12, zmm12, zmm16
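+
+	; At this point zmm9 holds the initial tweak multiplied by 2^0..2^3 (one
+	; power per 128-bit lane), zmm10 by 2^4..2^7, and zmm11/zmm12 by
+	; 2^8..2^15. shufb_15_7 gathers the top byte of each qword so the
+	; variable shifts plus vpclmulqdq can fold the shifted-out bits back in
+	; through the polynomial.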
+
+_main_loop_run_16:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ vmovdqu8 zmm3, [ptr_plaintext+16*8]
+ vmovdqu8 zmm4, [ptr_plaintext+16*12]
+ add ptr_plaintext, 256
+
+ decrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ vmovdqu8 [ptr_ciphertext+16*8], zmm3
+ vmovdqu8 [ptr_ciphertext+16*12], zmm4
+ add ptr_ciphertext, 256
+ sub N_val, 256
+ cmp N_val, 256
+ jge _main_loop_run_16
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ jmp _do_n_blocks
+
+_start_by8:
+	; Make first 7 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+_main_loop_run_8:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 128
+
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ add ptr_ciphertext, 128
+ sub N_val, 128
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ jmp _do_n_blocks
+
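+; Ciphertext stealing (decrypt): xmm8 holds the block obtained by decrypting
+; the last full ciphertext block with the later tweak; its first N_val bytes
+; are emitted as the final partial output. The remaining bytes are merged with
+; the partial ciphertext block via vpshufb_shf_table/mask1, and the merged
+; block is decrypted with the earlier tweak in xmm0 to give the last full
+; output block (stored at _done).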
+_steal_cipher:
+	; start ciphertext stealing: xmm8 = block decrypted with the later tweak, xmm0 = tweak for the merged final block
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm10, [twtempl+N_val]
+ vpshufb xmm8, xmm10
+
+ vmovdqu xmm3, [ptr_plaintext - 16 + N_val]
+ vmovdqu [ptr_ciphertext - 16 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm10, [twtempl]
+ vpxor xmm10, [mask1]
+ vpshufb xmm3, xmm10
+
+ vpblendvb xmm3, xmm3, xmm2, xmm10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm3, xmm0
+
+ ;decrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+ vaesdeclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm8, xmm0
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext - 16], xmm8
+
+_ret_:
+ mov rbx, [_gpr + 8*0]
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+%ifndef ALIGN_STACK
+ add rsp, VARIABLE_OFFSET
+%else
+ mov rsp, rbp
+ pop rbp
+%endif
+ ret
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ and N_val, 15
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa64 xmm16, xmm15
+ vmovdqa xmm15, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*7
+ vmovdqa64 xmm0, xmm16
+ vmovdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm7
+ jmp _done
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ and N_val, 15
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm15, xmm14
+ vmovdqa xmm14, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm0, xmm15
+ vmovdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm6
+ jmp _done
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ and N_val, 15
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm14, xmm13
+ vmovdqa xmm13, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm0, xmm14
+ vmovdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm5
+ jmp _done
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ and N_val, 15
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm13, xmm12
+ vmovdqa xmm12, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm0, xmm13
+ vmovdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _done
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ and N_val, 15
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm12, xmm11
+ vmovdqa xmm11, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm0, xmm12
+ vmovdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+ jmp _done
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ and N_val, 15
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm11, xmm10
+ vmovdqa xmm10, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm0, xmm11
+ vmovdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+ jmp _done
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ and N_val, 15
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, xmm9
+ vmovdqa xmm9, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm0, xmm10
+ vmovdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
+const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3
+const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5
+const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7
+const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1
+
+shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
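+
+; const_dq3210/const_dq7654 give the per-qword left-shift counts used by
+; vpsllvq to compute T*2^j (j = 0..7, two qwords per 128-bit lane), while
+; const_dq5678/const_dq1234 give the matching right-shift counts used to
+; extract the bits shifted out; shufb_15_7 positions the top byte of each
+; qword so those bits can be reduced with vpclmulqdq against zpoly.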
+
+%else  ; Assembler does not understand these opcodes; add an empty symbol for Windows builds.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_XTS_AES_128_dec_expanded_key_vaes
+no_XTS_AES_128_dec_expanded_key_vaes:
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm
new file mode 100644
index 000000000..19f887c2f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm
@@ -0,0 +1,1779 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 128-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 11 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_dec_sse(
+; UINT8 *k2, // key used for tweaking, 16*1 bytes
+; UINT8 *k1, // key used for "ECB" decryption, 16*1 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+; produce the key for the next round
+; raw_key is the output of aeskeygenassist instruction
+; round_key value before this key_expansion_128 macro is current round key
+; round_key value after this key_expansion_128 macro is next round key
+%macro key_expansion_128 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ pshufd %%xraw_key, %%xraw_key, 11111111b
+ shufps %%xtmp, %%xround_key, 00010000b
+ pxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ pxor %%xround_key, %%xtmp
+ pxor %%xround_key, %%xraw_key
+%endmacro
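+
+; Standard AES-128 key schedule step: aeskeygenassist supplies
+; SubWord(RotWord(w3)) xor Rcon in its top dword, pshufd broadcasts it, and
+; the two shufps/pxor pairs accumulate the running xor of the previous round
+; key's words before the final pxor produces the next round key.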
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 9
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%xtmp2 %6
+%define %%ptr_key2 %7
+%define %%ptr_key1 %8
+%define %%ptr_expanded_keys %9
+
+
+ movdqu %%xkey2, [%%ptr_key2]
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1
+
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*9], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*8], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*7], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*6], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*5], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*4], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*3], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*2], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*1], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*0], %%xkey1
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
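+
+; Because the data path uses aesdec (the equivalent inverse cipher), every
+; key1 round key except the first and last is passed through aesimc before
+; being stored, and the schedule is written to [keys] in reverse order:
+; [keys] holds the last round key and [keys+16*10] the original key.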
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; decrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; next 8 Tweak values are generated
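+; the scalar tweak updates (shl/adc/cmovc/xor on twtempl:twtemph) are stitched
+; in between the AES rounds so they overlap with aesdec latency; when %%lt128
+; is non-zero no further tweaks are needed and these updates are skipped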
+%macro decrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks decrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; decrypt 8 blocks in parallel
+; generate next 8 tweak values
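+; the next 8 tweaks are computed on the scalar side in between the AES rounds
+; and written to [TW]; they are reloaded into %%TW1-%%TW7 at the end of the
+; macro, while the 8th tweak is passed by the caller as the memory operand
+; [TW + 16*7]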
+%macro decrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdeclast %%ST1, %%T0
+ aesdeclast %%ST2, %%T0
+ aesdeclast %%ST3, %%T0
+ aesdeclast %%ST4, %%T0
+ aesdeclast %%ST5, %%T0
+ aesdeclast %%ST6, %%T0
+ aesdeclast %%ST7, %%T0
+ aesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_128_dec_sse, function
+XTS_AES_128_dec_sse:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ movdqa xmm1, [TW + 16*7]
+ movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
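+	; xmm8 holds the block just decrypted with the final tweak; its first
+	; N_val bytes are written out as the short trailing block, the remaining
+	; ciphertext tail is merged in via pshufb/pblendvb, and the combined
+	; block is decrypted with the previous tweak saved in [TW]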
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+	pxor	xmm8, xmm3	; state register is xmm8: swapping the pxor destination avoids a separate move from xmm3 to xmm8
+
+
+ ;decrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesdec xmm8, [keys + 16*1] ; round 1
+ aesdec xmm8, [keys + 16*2] ; round 2
+ aesdec xmm8, [keys + 16*3] ; round 3
+ aesdec xmm8, [keys + 16*4] ; round 4
+ aesdec xmm8, [keys + 16*5] ; round 5
+ aesdec xmm8, [keys + 16*6] ; round 6
+ aesdec xmm8, [keys + 16*7] ; round 7
+ aesdec xmm8, [keys + 16*8] ; round 8
+ aesdec xmm8, [keys + 16*9] ; round 9
+ aesdeclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm15
+ movdqa xmm15, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm14
+ movdqa xmm14, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm13
+ movdqa xmm13, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm12
+ movdqa xmm12, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm11
+ movdqa xmm11, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm10
+ movdqa xmm10, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm9
+ movdqa xmm9, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_vaes.asm
new file mode 100644
index 000000000..e3435dd83
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_vaes.asm
@@ -0,0 +1,1681 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 128-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
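+; in the wide code paths below (decrypt_by_eight_zmm / decrypt_by_16_zmm) the
+; tweaks are advanced with vpclmulqdq against the reduction constant in zpoly
+; instead of the scalar shl/adc sequence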
+
+%include "reg_sizes.asm"
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys	rsp + 16*8	; space reserved for 15 expanded keys (AES-128 uses 11)
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_dec_vaes(
+;               UINT8 *k2,      // key used for tweaking, 16*1 bytes
+;               UINT8 *k1,      // key used for "ECB" decryption, 16*1 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+;               const UINT8 *ct,        // ciphertext sector input data
+;               UINT8 *pt);     // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+%define zpoly zmm25
+
+; produce the key for the next round
+; raw_key is the output of vaeskeygenassist instruction
+; round_key value before this key_expansion_128 macro is current round key
+; round_key value after this key_expansion_128 macro is next round key
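+; vaeskeygenassist delivers RotWord(SubWord(..)) xor Rcon of the last word,
+; broadcast across the register by vpshufd; the vshufps/vpxor pairs propagate
+; the previous round-key words (equivalent to the usual pslldq/pxor chain)
+; before the final vpxor adds in the assisted word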
+%macro key_expansion_128 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
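+; key2's schedule is consumed on the fly to AES-encrypt the initial tweak
+; (result stored at [TW]); key1's round keys are run through vaesimc and
+; stored at %%ptr_expanded_keys in descending order (the original and final
+; round keys are stored without vaesimc), ready for the equivalent
+; inverse cipher vaesdec flow that walks [keys] from index 0 upward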
+
+%macro encrypt_T 9
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%xtmp2 %6
+%define %%ptr_key2 %7
+%define %%ptr_key1 %8
+%define %%ptr_expanded_keys %9
+
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1
+
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*9], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*8], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*7], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*6], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*5], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*4], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*3], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*2], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*1], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*0], %%xkey1
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; Original way to generate initial tweak values and load plaintext values
+; only used for small blocks
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+%endmacro
+
+
+; Original decrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; next 8 Tweak values can be generated
+%macro decrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks decrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+
+; Decrypt 8 blocks in parallel
+; generate next 8 tweak values
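+; each tweak register holds 4 consecutive tweaks; when %%last_eight is 0 the
+; next 8 tweaks are obtained by multiplying the current ones by x^8 in
+; GF(2^128): vpslldq shifts every 128-bit lane left by one byte and the byte
+; that falls off is folded back in via vpclmulqdq with the constant in zpoly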
+%macro decrypt_by_eight_zmm 6
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%TW1 %3 ; tweak 1
+%define %%TW2 %4 ; tweak 2
+%define %%T0 %5 ; Temp register
+%define %%last_eight %6
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW1, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW1, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW2, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW2, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+%endmacro
+
+
+; Decrypt 16 blocks in parallel
+; generate next 16 tweak values
+%macro decrypt_by_16_zmm 10
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+
+%define %%TW1 %5 ; tweak 1
+%define %%TW2 %6 ; tweak 2
+%define %%TW3 %7 ; tweak 3
+%define %%TW4 %8 ; tweak 4
+
+%define %%T0 %9 ; Temp register
+%define %%last_eight %10
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+ vpxorq %%ST3, %%T0
+ vpxorq %%ST4, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW3, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW3, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW4, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW4, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm15, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm17, zmm15, 1
+ vpxord zmm17, zmm17, zmm14
+%endif
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm16, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm18, zmm16, 1
+ vpxord zmm18, zmm18, zmm14
+%endif
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+ vmovdqa32 %%TW3, zmm17
+ vmovdqa32 %%TW4, zmm18
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_128_dec_vaes, function
+XTS_AES_128_dec_vaes:
+ endbranch
+
+%define ALIGN_STACK
+%ifdef ALIGN_STACK
+ push rbp
+ mov rbp, rsp
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+%else
+ sub rsp, VARIABLE_OFFSET
+%endif
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+ cmp N_val, 128
+ jl _less_than_128_bytes
+
+ vpbroadcastq zpoly, ghash_poly_8b
+
+ cmp N_val, 256
+ jge _start_by16
+
+ cmp N_val, 128
+ jge _start_by8
+
+_do_n_blocks:
+ cmp N_val, 0
+ je _ret_
+
+ cmp N_val, (7*16)
+ jge _remaining_num_blocks_is_7
+
+ cmp N_val, (6*16)
+ jge _remaining_num_blocks_is_6
+
+ cmp N_val, (5*16)
+ jge _remaining_num_blocks_is_5
+
+ cmp N_val, (4*16)
+ jge _remaining_num_blocks_is_4
+
+ cmp N_val, (3*16)
+ jge _remaining_num_blocks_is_3
+
+ cmp N_val, (2*16)
+ jge _remaining_num_blocks_is_2
+
+ cmp N_val, (1*16)
+ jge _remaining_num_blocks_is_1
+
+;; _remaining_num_blocks_is_0:
+	vmovdqu	xmm1, [ptr_plaintext - 16]	; re-do the last block with the next tweak
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext - 16], xmm1
+ vmovdqa xmm8, xmm1
+
+ ; Calc previous tweak
+ mov tmp1, 1
+ kmovq k1, tmp1
+ vpsllq xmm13, xmm9, 63
+ vpsraq xmm14, xmm13, 63
+ vpandq xmm5, xmm14, XWORD(zpoly)
+ vpxorq xmm9 {k1}, xmm9, xmm5
+ vpsrldq xmm10, xmm9, 8
+ vpshrdq xmm0, xmm9, xmm10, 1
+ vpslldq xmm13, xmm13, 8
+ vpxorq xmm0, xmm0, xmm13
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_7:
+ mov tmp1, -1
+ shr tmp1, 16
+ kmovq k1, tmp1
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*7
+ and N_val, 15
+ je _done_7_remain
+ vextracti32x4 xmm12, zmm10, 2
+ vextracti32x4 xmm13, zmm10, 3
+ vinserti32x4 zmm10, xmm13, 2
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ add ptr_ciphertext, 16*7
+ vextracti32x4 xmm8, zmm2, 0x2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_7_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_6:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 ymm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*6
+ and N_val, 15
+ je _done_6_remain
+ vextracti32x4 xmm12, zmm10, 1
+ vextracti32x4 xmm13, zmm10, 2
+ vinserti32x4 zmm10, xmm13, 1
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ add ptr_ciphertext, 16*6
+ vextracti32x4 xmm8, zmm2, 0x1
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_6_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ jmp _ret_
+
+_remaining_num_blocks_is_5:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*5
+ and N_val, 15
+ je _done_5_remain
+ vmovdqa xmm12, xmm10
+ vextracti32x4 xmm10, zmm10, 1
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_5_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_4:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ add ptr_plaintext, 16*4
+ and N_val, 15
+ je _done_4_remain
+ vextracti32x4 xmm12, zmm9, 3
+ vinserti32x4 zmm9, xmm10, 3
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ add ptr_ciphertext, 16*4
+ vextracti32x4 xmm8, zmm1, 0x3
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_4_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ jmp _ret_
+
+_remaining_num_blocks_is_3:
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ add ptr_plaintext, 16*3
+ and N_val, 15
+ je _done_3_remain
+ vextracti32x4 xmm13, zmm9, 2
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 3
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+ vmovdqa xmm0, xmm13
+ jmp _steal_cipher
+_done_3_remain:
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 2
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ jmp _ret_
+
+_remaining_num_blocks_is_2:
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ add ptr_plaintext, 16*2
+ and N_val, 15
+ je _done_2_remain
+ vextracti32x4 xmm10, zmm9, 2
+ vextracti32x4 xmm12, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_2_remain:
+ vextracti32x4 xmm10, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_1:
+ vmovdqu xmm1, [ptr_plaintext]
+ add ptr_plaintext, 16
+ and N_val, 15
+ je _done_1_remain
+ vextracti32x4 xmm11, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm11, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+ vmovdqa xmm8, xmm1
+ vmovdqa xmm0, xmm9
+ jmp _steal_cipher
+_done_1_remain:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ jmp _ret_
+
+
+
+_start_by16:
+	; Make first 8 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
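+	; k2 = 0xaa selects the odd (high) qword of every 128-bit lane, so the masked
+	; vpxorq below merges the bits shifted out of each tweak's low qword into its
+	; high qword; bits shifted out of bit 127 are reduced via vpclmulqdq with zpoly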
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+	; Make next 8 tweak values by multiplying all by 2^8
+ vpsrldq zmm13, zmm9, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm11, zmm9, 1
+ vpxord zmm11, zmm11, zmm14
+
+ vpsrldq zmm15, zmm10, 15
+ vpclmulqdq zmm16, zmm15, zpoly, 0
+ vpslldq zmm12, zmm10, 1
+ vpxord zmm12, zmm12, zmm16
+
+_main_loop_run_16:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ vmovdqu8 zmm3, [ptr_plaintext+16*8]
+ vmovdqu8 zmm4, [ptr_plaintext+16*12]
+ add ptr_plaintext, 256
+
+ decrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ vmovdqu8 [ptr_ciphertext+16*8], zmm3
+ vmovdqu8 [ptr_ciphertext+16*12], zmm4
+ add ptr_ciphertext, 256
+ sub N_val, 256
+ cmp N_val, 256
+ jge _main_loop_run_16
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ jmp _do_n_blocks
+
+_start_by8:
+	; Make first 8 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+_main_loop_run_8:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 128
+
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ add ptr_ciphertext, 128
+ sub N_val, 128
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ jmp _do_n_blocks
+
+_steal_cipher:
+	; start cipher stealing: xmm8 holds the decrypted last full block, xmm0 the tweak for the final block
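+	; The first N_val bytes of xmm8 become the partial plaintext output; its
+	; remaining 16-N_val bytes are appended to the trailing N_val ciphertext bytes
+	; and the merged block is decrypted once more with tweak xmm0 to produce the
+	; final full plaintext block (stored at ptr_ciphertext - 16 in _done)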
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm10, [twtempl+N_val]
+ vpshufb xmm8, xmm10
+
+ vmovdqu xmm3, [ptr_plaintext - 16 + N_val]
+ vmovdqu [ptr_ciphertext - 16 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm10, [twtempl]
+ vpxor xmm10, [mask1]
+ vpshufb xmm3, xmm10
+
+ vpblendvb xmm3, xmm3, xmm2, xmm10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm3, xmm0
+
+ ;decrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+ vaesdeclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm8, xmm0
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext - 16], xmm8
+
+_ret_:
+ mov rbx, [_gpr + 8*0]
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+%ifndef ALIGN_STACK
+ add rsp, VARIABLE_OFFSET
+%else
+ mov rsp, rbp
+ pop rbp
+%endif
+ ret
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ and N_val, 15
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa64 xmm16, xmm15
+ vmovdqa xmm15, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*7
+ vmovdqa64 xmm0, xmm16
+ vmovdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm7
+ jmp _done
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ and N_val, 15
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm15, xmm14
+ vmovdqa xmm14, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm0, xmm15
+ vmovdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm6
+ jmp _done
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ and N_val, 15
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm14, xmm13
+ vmovdqa xmm13, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm0, xmm14
+ vmovdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm5
+ jmp _done
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ and N_val, 15
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm13, xmm12
+ vmovdqa xmm12, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm0, xmm13
+ vmovdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _done
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ and N_val, 15
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm12, xmm11
+ vmovdqa xmm11, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm0, xmm12
+ vmovdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+ jmp _done
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ and N_val, 15
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm11, xmm10
+ vmovdqa xmm10, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm0, xmm11
+ vmovdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+ jmp _done
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ and N_val, 15
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, xmm9
+ vmovdqa xmm9, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm0, xmm10
+ vmovdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values as shift constants for the vpshufb instruction
+; different alignments result in the values shown below:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
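+; Indexing: a 16-byte load at [vpshufb_shf_table + N_val] is a vpshufb mask that
+; shifts a register left (toward higher byte indices) by 16 - N_val bytes, with
+; 0x80-tagged entries producing zero bytes.  In _steal_cipher the load at
+; [vpshufb_shf_table + 16 - N_val] is xor-ed with mask1 to form the matching
+; right-shift mask, which doubles as the vpblendvb selector.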
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
+const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3
+const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5
+const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7
+const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1
+
+shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+
+%else  ; assembler does not understand these opcodes; add an empty symbol for Windows
+%ifidn __OUTPUT_FORMAT__, win64
+global no_XTS_AES_128_dec_vaes
+no_XTS_AES_128_dec_vaes:
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm
new file mode 100644
index 000000000..819617283
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm
@@ -0,0 +1,1531 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 128-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; the second key is stored on the stack, aligned to 16 bytes
+; the first key is required only once, so it does not need to be stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
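+; 0x87 represents x^7 + x^2 + x + 1, the low terms of the GF(2^128) reduction
+; polynomial x^128 + x^7 + x^2 + x + 1 used by XTS: doubling a tweak shifts it
+; left one bit and, when a bit carries out of bit 127, xors 0x87 into the low byte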
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_enc_avx(
+; UINT8 *k2, // key used for tweaking, 16*1 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*1 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
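+; Illustrative call from C for a single sector (sketch only; names follow the
+; prototype above, and 512 is just an example sector size):
+;   XTS_AES_128_enc_avx(key2, key1, tweak, 512, pt, ct);
+; key1 and key2 are raw (unexpanded) 16-byte AES-128 keys; tweak holds the
+; 16-byte initial tweak value for the sector.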
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
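+; twtempl:twtemph hold the 128-bit tweak as low/high 64-bit halves.  The recurring
+; scalar sequence "shl twtempl, 1 / adc twtemph, twtemph / cmovc ..., ghash_poly_8b /
+; xor twtempl, ..." doubles the tweak in GF(2^128): the bit shifted out of the low
+; half carries into the high half, and a carry out of bit 127 is reduced with 0x87.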
+
+
+; produce the key for the next round
+; raw_key is the output of the vaeskeygenassist instruction
+; on entry, round_key holds the current round key
+; on exit, round_key holds the next round key
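+; The two shufps/vpxor pairs compute the running XOR of the four key words
+; (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3); xor-ing in the broadcast raw_key
+; (SubWord(RotWord(w3)) ^ Rcon from vaeskeygenassist) then gives the next round
+; key.  xtmp must enter with its low dword cleared (see "vpxor xmm4, xmm4" at the
+; call site); the macro leaves that dword cleared for the next invocation.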
+%macro key_expansion_128 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b
+ shufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+
+
+; macro to encrypt the tweak value while expanding both keys in parallel
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*3], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*4], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*5], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*6], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*7], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*8], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*9], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*10], %%xkey1
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt the initial AES blocks
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
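+; when %%lt128 == 0 the next 8 tweak values are computed with scalar code
+; interleaved between the AES rounds and written back to [TW]; when it is 1
+; (fewer than 128 bytes remain) tweak generation is skipped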
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
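+; the scalar generation of the next 8 tweak values is interleaved between the
+; AES rounds (and written to [TW]) to hide its latency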
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+ vaesenclast %%ST5, %%T0
+ vaesenclast %%ST6, %%T0
+ vaesenclast %%ST7, %%T0
+ vaesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_128_enc_avx, function
+XTS_AES_128_enc_avx:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because the last 8 blocks will not be stitched with tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+	vpblendvb	xmm3, xmm3, xmm2, xmm0	; xmm0 is the blend mask (explicit fourth operand in the AVX form)
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+	vpxor	xmm8, xmm3	; state stays in xmm8: instead of moving xmm3 to xmm8, the destination register of the vpxor is swapped
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+ vaesenclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
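For reference, the _steal_cipher path above implements standard XTS ciphertext stealing: the first N_val bytes of the last full ciphertext block become the final partial output block, and the partial plaintext, padded with the remaining stolen ciphertext bytes, is re-encrypted under the next tweak and written back as the new last full block. The vpshufb_shf_table and mask1 constants just above supply the variable byte shifts and the blend mask used to assemble that padded block. A minimal C sketch of the data movement (not part of the patch; all names are illustrative and the AES step is abstracted behind a caller-supplied function pointer):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef void (*aes_block_fn)(const uint8_t *round_keys,
                             const uint8_t in[16], uint8_t out[16]);

/* last_full_ct is the ciphertext of the final full block (xmm8 before _done);
 * pt_tail holds the n = 1..15 remaining plaintext bytes. */
static void xts_steal(aes_block_fn aes_enc, const uint8_t *round_keys,
                      const uint8_t next_tweak[16],
                      const uint8_t last_full_ct[16],
                      const uint8_t *pt_tail, size_t n,
                      uint8_t *ct_last_full, uint8_t *ct_tail)
{
    uint8_t buf[16];

    /* steal: the first n ciphertext bytes become the final partial block */
    memcpy(ct_tail, last_full_ct, n);

    /* pad the partial plaintext with the remaining 16-n stolen bytes
     * (this is what the vpshufb/vpblendvb shuffle above assembles) */
    memcpy(buf, pt_tail, n);
    memcpy(buf + n, last_full_ct + n, 16 - n);

    /* re-encrypt under the next tweak and overwrite the last full block */
    for (int i = 0; i < 16; i++) buf[i] ^= next_tweak[i];
    aes_enc(round_keys, buf, buf);
    for (int i = 0; i < 16; i++) buf[i] ^= next_tweak[i];
    memcpy(ct_last_full, buf, 16);
}
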
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_avx.asm
new file mode 100644
index 000000000..f0f5f02f5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_avx.asm
@@ -0,0 +1,1506 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 128-bit AES
+; expanded keys are not required to be aligned
+; plaintext and ciphertext are not required to be aligned
+; the second key (k1, used for data encryption) is stored on the stack, aligned to 16 bytes
+; the first key (k2, used for tweak encryption) is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+default rel
+
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 11 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
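Throughout this file the tweak is advanced by multiplying it by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1; GHASH_POLY (0x87) is the reduction feedback byte, and every shl/adc/cmovc/xor group operating on twtempl/twtemph below is one such doubling. A minimal C sketch of the operation (not part of the patch; the tweak is treated as two little-endian 64-bit halves, as stored at [TW] and [TW+8]):

#include <stdint.h>

/* tw[0] = low 64 bits, tw[1] = high 64 bits of the 128-bit tweak */
static void xts_mul_x(uint64_t tw[2])
{
    uint64_t carry = tw[1] >> 63;            /* bit shifted out of the top */
    tw[1] = (tw[1] << 1) | (tw[0] >> 63);    /* adc twtemph, twtemph       */
    tw[0] <<= 1;                             /* shl twtempl, 1             */
    if (carry)
        tw[0] ^= 0x87;                       /* cmovc/xor with GHASH_POLY  */
}
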
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_enc_expanded_key_avx(
+; UINT8 *k2, // key used for tweaking, 16*11 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*11 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
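A hypothetical caller sketch for the prototype above (not part of the patch; the prototype is restated from the comment rather than taken from a header, and both key schedules are assumed to be pre-expanded to 11 round keys of 16 bytes each):

#include <stdint.h>

void XTS_AES_128_enc_expanded_key_avx(uint8_t *k2, uint8_t *k1,
                                      uint8_t *tw_initial, uint64_t n,
                                      const uint8_t *pt, uint8_t *ct);

/* Encrypt one sector; n should be at least 16 bytes (one AES block). */
static void encrypt_sector(uint8_t tweak_keys[11 * 16], uint8_t data_keys[11 * 16],
                           uint8_t iv[16], const uint8_t *sector_in,
                           uint8_t *sector_out, uint64_t n)
{
    XTS_AES_128_enc_expanded_key_avx(tweak_keys, data_keys, iv, n,
                                     sector_in, sector_out);
}
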
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
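The encrypt_T macro above does two things at once: it AES-128-encrypts the initial tweak with the k2 schedule and copies the k1 round keys into the aligned stack area so the block-encryption macros can reload them with vmovdqa. An equivalent C-with-intrinsics sketch (not part of the patch; requires AES-NI, e.g. compile with -maes, and all names are illustrative):

#include <immintrin.h>
#include <stdint.h>
#include <string.h>

static void encrypt_tweak_and_cache_keys(const uint8_t *k2, const uint8_t *k1,
                                         const uint8_t iv[16],
                                         uint8_t cached_keys[11][16],
                                         uint8_t tweak0[16])
{
    /* AES-128-encrypt the initial tweak with the (expanded) tweak key */
    __m128i t = _mm_loadu_si128((const __m128i *)iv);
    t = _mm_xor_si128(t, _mm_loadu_si128((const __m128i *)k2));          /* ARK */
    for (int r = 1; r <= 9; r++)
        t = _mm_aesenc_si128(t, _mm_loadu_si128((const __m128i *)(k2 + 16 * r)));
    t = _mm_aesenclast_si128(t, _mm_loadu_si128((const __m128i *)(k2 + 16 * 10)));
    _mm_storeu_si128((__m128i *)tweak0, t);

    /* cache the data-encryption round keys in an aligned scratch area */
    memcpy(cached_keys, k1, 11 * 16);
}
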
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+ vaesenclast %%ST5, %%T0
+ vaesenclast %%ST6, %%T0
+ vaesenclast %%ST7, %%T0
+ vaesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
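encrypt_initial and encrypt_by_eight interleave ("stitch") the next tweak computations with the AES rounds so the integer and vector work overlap. Functionally, each block is processed as C = AES-ENC_k1(P xor T) xor T, with the tweak doubled between blocks. A one-block-at-a-time C reference for the same result (not part of the patch; reuses the xts_mul_x sketch given near GHASH_POLY above and requires AES-NI):

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void xts_encrypt_full_blocks(const uint8_t *round_keys /* 11*16 bytes */,
                                    uint8_t tweak[16],
                                    const uint8_t *pt, uint8_t *ct,
                                    size_t nblocks)
{
    for (size_t i = 0; i < nblocks; i++, pt += 16, ct += 16) {
        __m128i t  = _mm_loadu_si128((const __m128i *)tweak);
        __m128i st = _mm_xor_si128(_mm_loadu_si128((const __m128i *)pt), t);
        st = _mm_xor_si128(st, _mm_loadu_si128((const __m128i *)round_keys));
        for (int r = 1; r <= 9; r++)
            st = _mm_aesenc_si128(st,
                     _mm_loadu_si128((const __m128i *)(round_keys + 16 * r)));
        st = _mm_aesenclast_si128(st,
                 _mm_loadu_si128((const __m128i *)(round_keys + 16 * 10)));
        _mm_storeu_si128((__m128i *)ct, _mm_xor_si128(st, t));

        uint64_t tw[2];                 /* advance the tweak: T *= x          */
        memcpy(tw, tweak, 16);
        xts_mul_x(tw);                  /* sketch defined near GHASH_POLY     */
        memcpy(tweak, tw, 16);
    }
}
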
+
+section .text
+
+mk_global XTS_AES_128_enc_expanded_key_avx, function
+XTS_AES_128_enc_expanded_key_avx:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+	vpxor	xmm4, xmm4			; clear scratch register (keys are already expanded; no key expansion is done here)
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because the last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
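The dispatch above peels off (N/16) mod 8 initial blocks so that the remaining full blocks form a multiple of eight; all of those except the final eight run through _main_loop, and the final eight plus any 1..15-byte tail are handled by _last_eight and _steal_cipher. A small C sketch of that bookkeeping (not part of the patch; names are illustrative):

#include <stdint.h>

struct xts_split { uint64_t initial_blocks, tail_bytes; };

static struct xts_split split_sector(uint64_t n)
{
    struct xts_split s;
    s.initial_blocks = (n >> 4) & 7;   /* and tmp1, (7 << 4)             */
    s.tail_bytes     = n & 15;         /* handled by ciphertext stealing */
    return s;
}
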
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+	vpblendvb	xmm3, xmm3, xmm2, xmm0	; xmm0 is the blend mask (explicit fourth operand in the AVX form)
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+	vpxor	xmm8, xmm3	; state stays in xmm8: instead of moving xmm3 to xmm8, the destination register of the vpxor is swapped
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+ vaesenclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_sse.asm
new file mode 100644
index 000000000..8ac162c4c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_sse.asm
@@ -0,0 +1,1505 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 128-bit AES
+; expanded keys are not required to be aligned
+; plaintext and ciphertext are not required to be aligned
+; the second key (k1, used for data encryption) is stored on the stack, aligned to 16 bytes
+; the first key (k2, used for tweak encryption) is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 11 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_enc_expanded_key_sse(
+; UINT8 *k2, // key used for tweaking, 16*11 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*11 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*1]
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*2]
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*2]
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*3]
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*3]
+ movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*4]
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*4]
+ movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*5]
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*5]
+ movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*6]
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*6]
+ movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*7]
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*7]
+ movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*8]
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*8]
+ movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*9]
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*9]
+ movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*10]
+ aesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*10]
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+
+
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenclast %%ST1, %%T0
+ aesenclast %%ST2, %%T0
+ aesenclast %%ST3, %%T0
+ aesenclast %%ST4, %%T0
+ aesenclast %%ST5, %%T0
+ aesenclast %%ST6, %%T0
+ aesenclast %%ST7, %%T0
+ aesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
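+
+; Functionally, one _main_loop iteration built on the macro above is the plain per-block
+; XTS recipe sketched below (illustration only; it reuses aes128_encrypt_block and
+; xts_mul_x_u64 from the sketches earlier in this file, plus <string.h>). The tweak
+; doublings are simply interleaved with the AES rounds above to hide their latency.
+;
+;   /* T[8][16] holds the eight current tweaks, (lo, hi) the running twtempl:twtemph pair */
+;   static void xts_encrypt_8(const uint8_t rk1[16*11], uint8_t T[8][16],
+;                             uint64_t *lo, uint64_t *hi,
+;                             const uint8_t *pt, uint8_t *ct)
+;   {
+;       uint8_t b[16];
+;       for (int i = 0; i < 8; i++) {
+;           for (int j = 0; j < 16; j++) b[j] = pt[16*i + j] ^ T[i][j];
+;           aes128_encrypt_block(rk1, b, b);
+;           for (int j = 0; j < 16; j++) ct[16*i + j] = b[j] ^ T[i][j];
+;       }
+;       for (int i = 0; i < 8; i++) {             /* next eight tweaks: keep doubling */
+;           xts_mul_x_u64(lo, hi);
+;           memcpy(T[i], lo, 8);                  /* little-endian, as stored in [TW] */
+;           memcpy(T[i] + 8, hi, 8);
+;       }
+;   }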
+
+
+section .text
+
+mk_global XTS_AES_128_enc_expanded_key_sse, function
+XTS_AES_128_enc_expanded_key_sse:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesenc xmm8, [keys + 16*1] ; round 1
+ aesenc xmm8, [keys + 16*2] ; round 2
+ aesenc xmm8, [keys + 16*3] ; round 3
+ aesenc xmm8, [keys + 16*4] ; round 4
+ aesenc xmm8, [keys + 16*5] ; round 5
+ aesenc xmm8, [keys + 16*6] ; round 6
+ aesenc xmm8, [keys + 16*7] ; round 7
+ aesenc xmm8, [keys + 16*8] ; round 8
+ aesenc xmm8, [keys + 16*9] ; round 9
+ aesenclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
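+	; The stealing sequence above is the standard XTS ciphertext-stealing step; a rough
+	; C sketch (illustration only, reusing the assumed aes128_encrypt_block primitive and
+	; standard headers from the sketches near the top of this file). c_prev is the
+	; already-encrypted last full block held in xmm8, t_next the tweak just written to
+	; [TW], and b = N_val mod 16:
+	;
+	;   static void xts_steal(const uint8_t rk1[16*11], const uint8_t t_next[16],
+	;                         const uint8_t *p_tail, size_t b,
+	;                         const uint8_t c_prev[16],
+	;                         uint8_t *c_tail,        /* b bytes written here         */
+	;                         uint8_t c_last[16])     /* final full ciphertext block  */
+	;   {
+	;       uint8_t blk[16];
+	;       memcpy(blk, p_tail, b);                   /* partial plaintext tail ...   */
+	;       memcpy(blk + b, c_prev + b, 16 - b);      /* ... padded with stolen bytes */
+	;       memcpy(c_tail, c_prev, b);                /* short final ciphertext block */
+	;       for (int j = 0; j < 16; j++) blk[j] ^= t_next[j];
+	;       aes128_encrypt_block(rk1, blk, c_last);
+	;       for (int j = 0; j < 16; j++) c_last[j] ^= t_next[j];
+	;   }
+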
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_vaes.asm
new file mode 100644
index 000000000..730fdcba9
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_vaes.asm
@@ -0,0 +1,1473 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 128-bit AES
+; expanded keys are not aligned
+; the data key's round keys are copied to the stack in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_enc_expanded_key_vaes(
+;  UINT8 *k2, // key used for tweaking, 16*11 bytes
+;  UINT8 *k1, // key used for "ECB" encryption, 16*11 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+%define zpoly zmm25
+
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight_zmm 6
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%TW1 %3 ; tweak 1
+%define %%TW2 %4 ; tweak 2
+%define %%T0 %5 ; Temp register
+%define %%last_eight %6
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW1, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW1, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW2, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW2, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+%endmacro
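+
+; In the macro above each 128-bit lane's tweak is advanced eight blocks at once:
+; T' = T * x^8 mod (x^128 + x^7 + x^2 + x + 1), computed as (T << 8 bits) xor
+; clmul(top byte of T, 0x87). A rough scalar C equivalent of the
+; vpsrldq/vpclmulqdq/vpslldq/vpxord sequence, for illustration only:
+;
+;   #include <stdint.h>
+;   #include <string.h>
+;
+;   static void xts_mul_x8(uint8_t t[16])         /* t little-endian, as stored in TW  */
+;   {
+;       unsigned top = t[15];                     /* vpsrldq  zmm13, TW, 15            */
+;       unsigned red = 0;                         /* carry-less top * 0x87, <= 15 bits */
+;       for (int i = 0; i < 8; i++)
+;           if (top & (1u << i))
+;               red ^= 0x87u << i;                /* vpclmulqdq zmm14, zmm13, zpoly, 0 */
+;       memmove(t + 1, t, 15);                    /* vpslldq  zmm15, TW, 1             */
+;       t[0] = (uint8_t)red;                      /* vpxord: fold the reduction ...    */
+;       t[1] ^= (uint8_t)(red >> 8);              /* ... back into the low bytes       */
+;   }
+;
+; encrypt_by_16_zmm below chains two such steps (zmm15, then zmm17 from zmm15) so that
+; each lane advances by sixteen blocks per iteration.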
+
+
+; Encrypt 16 blocks in parallel
+; generate next 16 tweak values
+%macro encrypt_by_16_zmm 10
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+
+%define %%TW1 %5 ; tweak 1
+%define %%TW2 %6 ; tweak 2
+%define %%TW3 %7 ; tweak 3
+%define %%TW4 %8 ; tweak 4
+
+%define %%T0 %9 ; Temp register
+%define %%last_eight %10
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+ vpxorq %%ST3, %%T0
+ vpxorq %%ST4, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW3, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW3, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW4, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW4, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm15, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm17, zmm15, 1
+ vpxord zmm17, zmm17, zmm14
+%endif
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm16, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm18, zmm16, 1
+ vpxord zmm18, zmm18, zmm14
+%endif
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+ vmovdqa32 %%TW3, zmm17
+ vmovdqa32 %%TW4, zmm18
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_128_enc_expanded_key_vaes, function
+XTS_AES_128_enc_expanded_key_vaes:
+ endbranch
+
+%define ALIGN_STACK
+%ifdef ALIGN_STACK
+ push rbp
+ mov rbp, rsp
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+%else
+ sub rsp, VARIABLE_OFFSET
+%endif
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+ cmp N_val, 128
+ jl _less_than_128_bytes
+
+ vpbroadcastq zpoly, ghash_poly_8b
+
+ cmp N_val, 256
+ jge _start_by16
+
+ cmp N_val, 128
+ jge _start_by8
+
+_do_n_blocks:
+ cmp N_val, 0
+ je _ret_
+
+ cmp N_val, (7*16)
+ jge _remaining_num_blocks_is_7
+
+ cmp N_val, (6*16)
+ jge _remaining_num_blocks_is_6
+
+ cmp N_val, (5*16)
+ jge _remaining_num_blocks_is_5
+
+ cmp N_val, (4*16)
+ jge _remaining_num_blocks_is_4
+
+ cmp N_val, (3*16)
+ jge _remaining_num_blocks_is_3
+
+ cmp N_val, (2*16)
+ jge _remaining_num_blocks_is_2
+
+ cmp N_val, (1*16)
+ jge _remaining_num_blocks_is_1
+
+;; _remaining_num_blocks_is_0:
+ vmovdqa xmm8, xmm0
+ vmovdqa xmm0, xmm9
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_7:
+ mov tmp1, -1
+ shr tmp1, 16
+ kmovq k1, tmp1
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*7
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ add ptr_ciphertext, 16*7
+
+ vextracti32x4 xmm8, zmm2, 0x2
+ vextracti32x4 xmm0, zmm10, 0x3
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_6:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 ymm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*6
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ add ptr_ciphertext, 16*6
+
+ vextracti32x4 xmm8, zmm2, 0x1
+ vextracti32x4 xmm0, zmm10, 0x2
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_5:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*5
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ add ptr_ciphertext, 16*5
+
+	vmovdqa	xmm8, xmm2
+ vextracti32x4 xmm0, zmm10, 0x1
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_4:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ add ptr_plaintext, 16*4
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ add ptr_ciphertext, 16*4
+
+ vextracti32x4 xmm8, zmm1, 0x3
+ vextracti32x4 xmm0, zmm10, 0x0
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_3:
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 2
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ vmovdqa xmm8, xmm3
+ vextracti32x4 xmm0, zmm9, 3
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_2:
+ vextracti32x4 xmm10, zmm9, 1
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*2
+
+ vmovdqa xmm8, xmm2
+ vextracti32x4 xmm0, zmm9, 2
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_1:
+ vmovdqu xmm1, [ptr_plaintext]
+ add ptr_plaintext, 16
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ vmovdqa xmm8, xmm1
+ vextracti32x4 xmm0, zmm9, 1
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+
+_start_by16:
+	; Make first 8 tweak values (T*x^0 .. T*x^7)
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
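+	; The two blocks below compute TW*x^k per 128-bit lane (k = 0..3 and
+	; k = 4..7): vpsllvq shifts both qwords of a lane left by k bits,
+	; vpsrlvq recovers the bits shifted out of each qword (positioned by
+	; the byte-15/byte-7 shuffle), the k2 = 0xaa merge carries the
+	; low-qword overflow into the high qword, and vpclmulqdq with
+	; zpoly = 0x87 reduces the bits shifted out of the top of the lane.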
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+	; Make next 8 tweak values by multiplying each by x^8
+ vpsrldq zmm13, zmm9, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm11, zmm9, 1
+ vpxord zmm11, zmm11, zmm14
+
+ vpsrldq zmm15, zmm10, 15
+ vpclmulqdq zmm16, zmm15, zpoly, 0
+ vpslldq zmm12, zmm10, 1
+ vpxord zmm12, zmm12, zmm16
+
+_main_loop_run_16:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ vmovdqu8 zmm3, [ptr_plaintext+16*8]
+ vmovdqu8 zmm4, [ptr_plaintext+16*12]
+ add ptr_plaintext, 256
+
+ encrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ vmovdqu8 [ptr_ciphertext+16*8], zmm3
+ vmovdqu8 [ptr_ciphertext+16*12], zmm4
+ add ptr_ciphertext, 256
+ sub N_val, 256
+
+ cmp N_val, 256
+ jge _main_loop_run_16
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+	vextracti32x4	xmm0, zmm4, 0x3		; keep last encrypted block
+ jmp _do_n_blocks
+
+_start_by8:
+	; Make first 8 tweak values (T*x^0 .. T*x^7)
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+_main_loop_run_8:
+ ; load plaintext
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 128
+
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0
+
+ ; store ciphertext
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ add ptr_ciphertext, 128
+ sub N_val, 128
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+	vextracti32x4	xmm0, zmm2, 0x3		; keep last encrypted block
+ jmp _do_n_blocks
+
+_steal_cipher_next:
+ ; generate next Tweak value
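+	; (double the 128-bit tweak in GF(2^128): shift twtempl:twtemph left
+	;  by one bit and xor 0x87 into the low half if a bit carries out)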
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+ vmovdqa xmm0, [TW]
+
+_steal_cipher:
+ ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak
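+	; The first N_val bytes of the previous ciphertext block become the
+	; final partial output, its remaining 16-N_val bytes are stolen to pad
+	; the last partial plaintext block, and the padded block is encrypted
+	; with the next tweak and stored in place of the previous full block.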
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm10, [twtempl+N_val]
+ vpshufb xmm8, xmm10
+
+ vmovdqu xmm3, [ptr_plaintext - 16 + N_val]
+ vmovdqu [ptr_ciphertext - 16 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm10, [twtempl]
+ vpxor xmm10, [mask1]
+ vpshufb xmm3, xmm10
+
+ vpblendvb xmm3, xmm3, xmm2, xmm10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm3, xmm0
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+ vaesenclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm8, xmm0
+
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext - 16], xmm8
+
+_ret_:
+ mov rbx, [_gpr + 8*0]
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+%ifndef ALIGN_STACK
+ add rsp, VARIABLE_OFFSET
+%else
+ mov rsp, rbp
+ pop rbp
+%endif
+ ret
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm1
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
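+;
+; Example: with N_val = 4, [vpshufb_shf_table + 4] loads the mask bytes
+; 0x84..0x8f, 0x00..0x03; the 0x8x entries (high bit set) zero the low 12
+; result bytes while the low 4 source bytes move to the top, i.e. a left
+; shift by 16 - N_val = 12 bytes, as used in _steal_cipher above.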
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
+const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3
+const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5
+const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7
+const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1
+
+shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+
+%else ; Assembler doesn't understand these opcodes. Add an empty symbol for Windows.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_XTS_AES_128_enc_expanded_key_vaes
+no_XTS_AES_128_enc_expanded_key_vaes:
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm
new file mode 100644
index 000000000..cbb98cc38
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm
@@ -0,0 +1,1530 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 128-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys	rsp + 16*8	; store 11 expanded round keys (AES-128)
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
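+
+; Resulting stack layout: the 8 tweak values at [TW], the 11 AES-128 round
+; keys at [keys], then (win64 only) the xmm6-xmm15 save area, and finally
+; the callee-saved general purpose registers at [_gpr].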
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_enc_sse(
+; UINT8 *k2, // key used for tweaking, 16*1 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*1 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
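+
+; Illustrative caller (a hypothetical C harness, not part of this file; the
+; buffer names and the 4 KiB sector size are assumptions for the example):
+;
+;     uint8_t k2[16], k1[16], tweak[16];
+;     uint8_t pt[4096], ct[4096];
+;     XTS_AES_128_enc_sse(k2, k1, tweak, sizeof(pt), pt, ct);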
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; produce the key for the next round
+; raw_key is the output of aeskeygenassist instruction
+; round_key value before this key_expansion_128 macro is current round key
+; round_key value after this key_expansion_128 macro is next round key
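+; The computation follows the standard AES-128 key schedule,
+;   w[i] = w[i-4] ^ (i mod 4 == 0 ? SubWord(RotWord(w[i-1])) ^ Rcon[i/4] : w[i-1]);
+; aeskeygenassist supplies SubWord(RotWord(.)) ^ Rcon (broadcast by pshufd),
+; and the two shufps/pxor pairs build the running xor of the previous round
+; key's four words.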
+%macro key_expansion_128 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ pshufd %%xraw_key, %%xraw_key, 11111111b
+ shufps %%xtmp, %%xround_key, 00010000b
+ pxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ pxor %%xround_key, %%xtmp
+ pxor %%xround_key, %%xraw_key
+%endmacro
+
+
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
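+; (the initial tweak is encrypted with key2 through the ten AES-128 rounds
+;  while both key schedules are expanded on the fly; each key1 round key is
+;  stored to [keys] and the encrypted tweak is stored to [TW] at the end)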
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+
+ movdqu %%xkey2, [%%ptr_key2]
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*3], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*4], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*5], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*6], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*7], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*8], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*9], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*10], %%xkey1
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
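+; (the tweaks are advanced with the scalar shl/adc/cmovc/xor sequence: the
+;  128-bit tweak in twtempl:twtemph is doubled and GHASH_POLY = 0x87 is
+;  xor-ed into the low half whenever the top bit carries out; the results
+;  are written back to [TW] in the gaps between AES rounds)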
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+
+
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenclast %%ST1, %%T0
+ aesenclast %%ST2, %%T0
+ aesenclast %%ST3, %%T0
+ aesenclast %%ST4, %%T0
+ aesenclast %%ST5, %%T0
+ aesenclast %%ST6, %%T0
+ aesenclast %%ST7, %%T0
+ aesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_128_enc_sse, function
+XTS_AES_128_enc_sse:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesenc xmm8, [keys + 16*1] ; round 1
+ aesenc xmm8, [keys + 16*2] ; round 2
+ aesenc xmm8, [keys + 16*3] ; round 3
+ aesenc xmm8, [keys + 16*4] ; round 4
+ aesenc xmm8, [keys + 16*5] ; round 5
+ aesenc xmm8, [keys + 16*6] ; round 6
+ aesenc xmm8, [keys + 16*7] ; round 7
+ aesenc xmm8, [keys + 16*8] ; round 8
+ aesenc xmm8, [keys + 16*9] ; round 9
+ aesenclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_vaes.asm
new file mode 100644
index 000000000..3532ddda5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_vaes.asm
@@ -0,0 +1,1498 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 128-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; the second key is stored on the stack, 16-byte aligned
+; the first key is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_enc_vaes(
+; UINT8 *k2, // key used for tweaking, 16*1 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*1 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
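For orientation, this symbol is normally reached through the library's multibinary dispatcher rather than called directly. A minimal C sketch of a call, assuming the XTS_AES_128_enc prototype from this library's aes_xts.h and using the common sector-number-as-tweak convention (an illustration, not something this file mandates):

    #include <stdint.h>
    #include <string.h>
    #include "aes_xts.h"                /* isa-l_crypto public header */

    /* Encrypt one 512-byte sector; k1 and k2 are the two 16-byte AES-128 XTS keys. */
    static void encrypt_sector(uint8_t k2[16], uint8_t k1[16], uint64_t sector_no,
                               const uint8_t pt[512], uint8_t ct[512])
    {
        uint8_t iv[16] = {0};
        memcpy(iv, &sector_no, sizeof(sector_no));   /* little-endian sector number as tweak */
        XTS_AES_128_enc(k2, k1, iv, 512, pt, ct);    /* dispatcher selects the best variant  */
    }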
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+%define zpoly zmm25
+
+
+; produce the key for the next round
+; raw_key is the output of the vaeskeygenassist instruction
+; on entry, round_key holds the current round key
+; on exit, round_key holds the next round key
+%macro key_expansion_128 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
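For readers less used to the vshufps trick: with xtmp pre-zeroed, the shuffle/xor cascade above folds the previous round-key words together and then xors in the broadcast vaeskeygenassist result, which is the standard AES-128 key-schedule step. A hedged C sketch of the same computation with AES-NI intrinsics, using byte shifts in place of the shuffles (a model, not this file's code path):

    #include <immintrin.h>   /* AES-NI intrinsics; compile with -maes */

    /* One AES-128 key-schedule step: 'assist' is the aeskeygenassist result for
     * the current round key and round constant; returns the next round key. */
    static __m128i aes128_next_round_key(__m128i key, __m128i assist)
    {
        __m128i t;
        assist = _mm_shuffle_epi32(assist, 0xff);  /* broadcast SubWord(RotWord(w3)) ^ rcon */
        t   = _mm_slli_si128(key, 4);              /* key words shifted up by one dword     */
        key = _mm_xor_si128(key, t);
        t   = _mm_slli_si128(t, 4);
        key = _mm_xor_si128(key, t);
        t   = _mm_slli_si128(t, 4);
        key = _mm_xor_si128(key, t);               /* key now holds the w0^..^wi cascade    */
        return _mm_xor_si128(key, assist);
    }

    /* usage: rk1 = aes128_next_round_key(rk0, _mm_aeskeygenassist_si128(rk0, 0x01)); */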
+
+
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*3], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*4], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*5], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*6], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*7], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*8], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*9], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*10], %%xkey1
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+%endmacro
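The repeated shl/adc/cmovc/xor runs above are a two-register GF(2^128) doubling of the tweak (multiply by x, reduce by the 0x87 polynomial). A scalar C model of one such step, with the tweak held as two little-endian 64-bit halves as on the stack here:

    #include <stdint.h>

    /* tw[0] = low half, tw[1] = high half of the 128-bit tweak */
    static void xts_tweak_double(uint64_t tw[2])
    {
        uint64_t carry = tw[1] >> 63;               /* bit pushed out of bit 127        */
        tw[1] = (tw[1] << 1) | (tw[0] >> 63);       /* asm: shl low, 1 / adc high, high */
        tw[0] = (tw[0] << 1) ^ (carry ? 0x87 : 0);  /* asm: cmovc + xor with GHASH_POLY */
    }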
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight_zmm 6
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%TW1 %3 ; tweak 1
+%define %%TW2 %4 ; tweak 2
+%define %%T0 %5 ; Temp register
+%define %%last_eight %6
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW1, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW1, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW2, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW2, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+%endmacro
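The vpsrldq/vpclmulqdq/vpslldq triple inside this macro advances every tweak lane by eight blocks at once: each 128-bit tweak is multiplied by x^8 (a one-byte shift), and the byte that falls off the top is folded back in through the 0x87 polynomial. A scalar C model of one lane, offered only as an illustration of the math:

    #include <stdint.h>

    /* tw = tw * x^8 in GF(2^128) mod x^128 + x^7 + x^2 + x + 1 (0x87). */
    static void xts_tweak_mul_x8(uint64_t tw[2])
    {
        uint64_t top  = tw[1] >> 56;                /* byte that will spill past bit 127 */
        uint64_t fold = 0;
        for (int b = 0; b < 8; b++)                 /* carry-less multiply: top * 0x87   */
            if (top & (1ull << b))
                fold ^= (uint64_t)0x87 << b;
        tw[1] = (tw[1] << 8) | (tw[0] >> 56);       /* the one-byte left shift (vpslldq) */
        tw[0] = (tw[0] << 8) ^ fold;                /* fold the reduction back in        */
    }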
+
+
+; Encrypt 16 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_16_zmm 10
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+
+%define %%TW1 %5 ; tweak 1
+%define %%TW2 %6 ; tweak 2
+%define %%TW3 %7 ; tweak 3
+%define %%TW4 %8 ; tweak 4
+
+%define %%T0 %9 ; Temp register
+%define %%last_eight %10
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+ vpxorq %%ST3, %%T0
+ vpxorq %%ST4, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW3, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW3, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW4, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW4, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm15, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm17, zmm15, 1
+ vpxord zmm17, zmm17, zmm14
+%endif
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm16, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm18, zmm16, 1
+ vpxord zmm18, zmm18, zmm14
+%endif
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+ vmovdqa32 %%TW3, zmm17
+ vmovdqa32 %%TW4, zmm18
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_128_enc_vaes, function
+XTS_AES_128_enc_vaes:
+ endbranch
+
+%define ALIGN_STACK
+%ifdef ALIGN_STACK
+ push rbp
+ mov rbp, rsp
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+%else
+ sub rsp, VARIABLE_OFFSET
+%endif
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+ cmp N_val, 128
+ jl _less_than_128_bytes
+
+ vpbroadcastq zpoly, ghash_poly_8b
+
+ cmp N_val, 256
+ jge _start_by16
+
+ cmp N_val, 128
+ jge _start_by8
+
+_do_n_blocks:
+ cmp N_val, 0
+ je _ret_
+
+ cmp N_val, (7*16)
+ jge _remaining_num_blocks_is_7
+
+ cmp N_val, (6*16)
+ jge _remaining_num_blocks_is_6
+
+ cmp N_val, (5*16)
+ jge _remaining_num_blocks_is_5
+
+ cmp N_val, (4*16)
+ jge _remaining_num_blocks_is_4
+
+ cmp N_val, (3*16)
+ jge _remaining_num_blocks_is_3
+
+ cmp N_val, (2*16)
+ jge _remaining_num_blocks_is_2
+
+ cmp N_val, (1*16)
+ jge _remaining_num_blocks_is_1
+
+;; _remaining_num_blocks_is_0:
+ vmovdqa xmm8, xmm0
+ vmovdqa xmm0, xmm9
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_7:
+ mov tmp1, -1
+ shr tmp1, 16
+ kmovq k1, tmp1
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*7
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ add ptr_ciphertext, 16*7
+
+ vextracti32x4 xmm8, zmm2, 0x2
+ vextracti32x4 xmm0, zmm10, 0x3
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_6:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 ymm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*6
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ add ptr_ciphertext, 16*6
+
+ vextracti32x4 xmm8, zmm2, 0x1
+ vextracti32x4 xmm0, zmm10, 0x2
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_5:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*5
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ add ptr_ciphertext, 16*5
+
+ movdqa xmm8, xmm2
+ vextracti32x4 xmm0, zmm10, 0x1
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_4:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ add ptr_plaintext, 16*4
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ add ptr_ciphertext, 16*4
+
+ vextracti32x4 xmm8, zmm1, 0x3
+ vextracti32x4 xmm0, zmm10, 0x0
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_3:
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 2
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ vmovdqa xmm8, xmm3
+ vextracti32x4 xmm0, zmm9, 3
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_2:
+ vextracti32x4 xmm10, zmm9, 1
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*2
+
+ vmovdqa xmm8, xmm2
+ vextracti32x4 xmm0, zmm9, 2
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_1:
+ vmovdqu xmm1, [ptr_plaintext]
+ add ptr_plaintext, 16
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ vmovdqa xmm8, xmm1
+ vextracti32x4 xmm0, zmm9, 1
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+
+_start_by16:
+ ; Make first 7 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+ ; Make next 8 tweak values by all x 2^8
+ vpsrldq zmm13, zmm9, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm11, zmm9, 1
+ vpxord zmm11, zmm11, zmm14
+
+ vpsrldq zmm15, zmm10, 15
+ vpclmulqdq zmm16, zmm15, zpoly, 0
+ vpslldq zmm12, zmm10, 1
+ vpxord zmm12, zmm12, zmm16
+
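The vpsllvq/vpsrlvq/vpclmulqdq groups above build the first eight tweaks directly: lane j ends up holding the initial tweak multiplied by x^j, with the j bits that spill past bit 127 reduced through 0x87 (the shufb_15_7 shuffle isolates the bytes whose bits spill). One lane of that computation, sketched in scalar C under the same two-word layout used earlier:

    #include <stdint.h>

    /* out = tw * x^j in GF(2^128) mod 0x87, for small j (0..7). */
    static void xts_tweak_mul_xj(const uint64_t tw[2], int j, uint64_t out[2])
    {
        uint64_t spill = j ? tw[1] >> (64 - j) : 0;            /* j bits pushed past bit 127 */
        out[1] = (tw[1] << j) | (j ? tw[0] >> (64 - j) : 0);
        out[0] = tw[0] << j;
        for (int b = 0; b < j; b++)                            /* carry-less fold by 0x87    */
            if (spill & (1ull << b))
                out[0] ^= (uint64_t)0x87 << b;
    }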
+_main_loop_run_16:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ vmovdqu8 zmm3, [ptr_plaintext+16*8]
+ vmovdqu8 zmm4, [ptr_plaintext+16*12]
+ add ptr_plaintext, 256
+
+ encrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ vmovdqu8 [ptr_ciphertext+16*8], zmm3
+ vmovdqu8 [ptr_ciphertext+16*12], zmm4
+ add ptr_ciphertext, 256
+ sub N_val, 256
+
+ cmp N_val, 256
+ jge _main_loop_run_16
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ vextracti32x4 xmm0, zmm4, 0x3 ; keep the last encrypted block
+ jmp _do_n_blocks
+
+_start_by8:
+ ; Make first 7 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+_main_loop_run_8:
+ ; load plaintext
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 128
+
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0
+
+ ; store ciphertext
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ add ptr_ciphertext, 128
+ sub N_val, 128
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ vextracti32x4 xmm0, zmm2, 0x3 ; keep the last encrypted block
+ jmp _do_n_blocks
+
+_steal_cipher_next:
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+ vmovdqa xmm0, [TW]
+
+_steal_cipher:
+ ; start of simplified cipher stealing: xmm8 holds the last cipher block, xmm0 holds the next tweak
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm10, [twtempl+N_val]
+ vpshufb xmm8, xmm10
+
+ vmovdqu xmm3, [ptr_plaintext - 16 + N_val]
+ vmovdqu [ptr_ciphertext - 16 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm10, [twtempl]
+ vpxor xmm10, [mask1]
+ vpshufb xmm3, xmm10
+
+ vpblendvb xmm3, xmm3, xmm2, xmm10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm3, xmm0
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+ vaesenclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm8, xmm0
+
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext - 16], xmm8
+
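The _steal_cipher path above is standard XTS ciphertext stealing: the final r = N mod 16 plaintext bytes borrow the tail of the previous ciphertext block, and that previous block is then re-encrypted under the last tweak. A byte-level C sketch of the same idea; aes128_encrypt_block is a hypothetical single-block helper, not something defined in this file:

    #include <stdint.h>
    #include <string.h>

    /* hypothetical one-block AES-128 ECB encryption helper */
    void aes128_encrypt_block(const uint8_t round_keys[11][16],
                              const uint8_t in[16], uint8_t out[16]);

    /* m full blocks already written; r trailing bytes remain (0 < r < 16).
     * prev_ct is the ciphertext of the last full block, tweak is the tweak
     * that would have been used for the missing full block. */
    static void xts_steal_tail(const uint8_t *pt, uint8_t *ct, size_t m, size_t r,
                               const uint8_t tweak[16], const uint8_t prev_ct[16],
                               const uint8_t round_keys[11][16])
    {
        uint8_t buf[16];
        memcpy(ct + 16 * m, prev_ct, r);            /* short final block = head of prev_ct */
        memcpy(buf, pt + 16 * m, r);                /* partial plaintext ...               */
        memcpy(buf + r, prev_ct + r, 16 - r);       /* ... padded with the stolen tail     */
        for (int i = 0; i < 16; i++) buf[i] ^= tweak[i];
        aes128_encrypt_block(round_keys, buf, buf);
        for (int i = 0; i < 16; i++) buf[i] ^= tweak[i];
        memcpy(ct + 16 * (m - 1), buf, 16);         /* overwrite the previous ct block     */
    }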
+_ret_:
+ mov rbx, [_gpr + 8*0]
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+%ifndef ALIGN_STACK
+ add rsp, VARIABLE_OFFSET
+%else
+ mov rsp, rbp
+ pop rbp
+%endif
+ ret
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7*16)
+ cmp tmp1, (6*16)
+ je _num_blocks_is_6
+ cmp tmp1, (5*16)
+ je _num_blocks_is_5
+ cmp tmp1, (4*16)
+ je _num_blocks_is_4
+ cmp tmp1, (3*16)
+ je _num_blocks_is_3
+ cmp tmp1, (2*16)
+ je _num_blocks_is_2
+ cmp tmp1, (1*16)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm1
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
+const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3
+const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5
+const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7
+const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1
+
+shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+
+%else ; Assembler does not understand these opcodes. Add an empty symbol for Windows.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_XTS_AES_128_enc_vaes
+no_XTS_AES_128_enc_vaes:
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm
new file mode 100644
index 000000000..776525bdd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm
@@ -0,0 +1,1962 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 256-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; the second key is stored on the stack, 16-byte aligned
+; the first key is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_dec_avx(
+; UINT8 *k2, // key used for tweaking, 16*2 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
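As with the encrypt side, this symbol is normally reached through the library's dispatcher; a minimal C sketch of a call, assuming the XTS_AES_256_dec prototype from aes_xts.h (the sector-number-as-tweak convention is again only an illustration):

    #include <stdint.h>
    #include <string.h>
    #include "aes_xts.h"

    /* Decrypt one 512-byte sector; k1 and k2 are the two 32-byte AES-256 XTS keys. */
    static void decrypt_sector(uint8_t k2[32], uint8_t k1[32], uint64_t sector_no,
                               const uint8_t ct[512], uint8_t pt[512])
    {
        uint8_t iv[16] = {0};
        memcpy(iv, &sector_no, sizeof(sector_no));
        XTS_AES_256_dec(k2, k1, iv, 512, ct, pt);
    }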
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; produce the key for the next round
+; raw_key is the output of the vaeskeygenassist instruction
+; on entry, round_key holds the current round key
+; on exit, round_key holds the next round key
+; the two macros below (flip/flop) are used alternately for 256-bit key generation
+%macro key_expansion_256_flip 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+%macro key_expansion_256_flop 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 10101010b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 11
+%define %%xkey2 %1
+%define %%xkey2_2 %2
+%define %%xstate_tweak %3
+%define %%xkey1 %4
+%define %%xkey1_2 %5
+%define %%xraw_key %6
+%define %%xtmp %7
+%define %%xtmp2 %8
+%define %%ptr_key2 %9
+%define %%ptr_key1 %10
+%define %%ptr_expanded_keys %11
+
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1
+
+ vmovdqu %%xkey2_2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1_2, [%%ptr_key1 + 16*1]
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*13], %%xtmp2
+
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*12], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*11], %%xtmp2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xtmp2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
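Since this is the decrypt path, the macro above also prepares the Equivalent Inverse Cipher key schedule: the round keys are stored in reverse order and all but the outer two are passed through vaesimc. A compact intrinsics sketch of that transformation, assuming the 15 AES-256 encryption round keys are already expanded (a model, not this file's code):

    #include <immintrin.h>   /* _mm_aesimc_si128; compile with -maes */

    /* Build the decryption schedule from the 15 AES-256 encryption round keys:
     * reverse the order and apply InvMixColumns to all but the first and last. */
    static void aes256_dec_key_schedule(const __m128i enc[15], __m128i dec[15])
    {
        dec[0] = enc[14];                        /* last encryption key comes first   */
        for (int i = 1; i < 14; i++)
            dec[i] = _mm_aesimc_si128(enc[14 - i]);
        dec[14] = enc[0];                        /* original (round 0) key comes last */
    }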
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Decrypt 8 blocks in parallel (uses vaesdec; the macro keeps the encrypt_by_eight
+; name shared with the encryption variant)
+; generate next 8 tweak values
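+;
+; the tweak update scattered across the AES rounds below is a multiplication
+; by x in GF(2^128): the 128-bit tweak held in twtempl:twtemph is shifted left
+; by one (shl/adc) and, when the carry is set, the low byte is xor-ed with the
+; reduction constant 0x87 (ghash_poly_8b); stitching this scalar work into the
+; vaesdec rounds hides its latency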
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+ vaesdeclast %%ST5, %%T0
+ vaesdeclast %%ST6, %%T0
+ vaesdeclast %%ST7, %%T0
+ vaesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_256_dec_avx, function
+XTS_AES_256_dec_avx:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, xmm7, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because the last 8 blocks (128 bytes) are not stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
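+	; tmp1 = N & (7 << 4) is 16 times the number (0..7) of whole blocks
+	; processed before the first group of eight, so the length left for the
+	; stitched main loop is a multiple of 128 bytes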
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
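+	; a non-zero N mod 16 means the tail needs ciphertext stealing: one more
+	; tweak is generated and swapped with the tweak of the last full block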
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ vmovdqa xmm1, [TW + 16*7]
+ vmovdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
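+	; in XTS decryption with ciphertext stealing the tweak order is swapped:
+	; the last full ciphertext block uses the newly generated tweak (written
+	; to [TW + 16*7] below), and the saved previous tweak at [TW] is used
+	; later in _steal_cipher for the partial block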
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
+
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+ vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+ vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+	vaesdec         xmm8, [keys + 16*10]               ; round 10
+	vaesdec         xmm8, [keys + 16*11]               ; round 11
+	vaesdec         xmm8, [keys + 16*12]               ; round 12
+	vaesdec         xmm8, [keys + 16*13]               ; round 13
+	vaesdeclast     xmm8, [keys + 16*14]               ; round 14
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm15
+ vmovdqa xmm15, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm14
+ vmovdqa xmm14, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm13
+ vmovdqa xmm13, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm12
+ vmovdqa xmm12, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm11
+ vmovdqa xmm11, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm10
+ vmovdqa xmm10, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm9
+ vmovdqa xmm9, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_avx.asm
new file mode 100644
index 000000000..d52d0977e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_avx.asm
@@ -0,0 +1,1896 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 256-bit AES
+; expanded keys are not aligned
+; plaintext and ciphertext are not aligned
+; the data key schedule (k1, second argument) is copied to the stack, 16-Byte aligned
+; the tweak key schedule (k2, first argument) is used only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
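+; 0x87 is the low byte of the XTS tweak reduction polynomial
+; x^128 + x^7 + x^2 + x + 1; it is xor-ed into the shifted tweak whenever the
+; 128-bit left shift carries out of bit 127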
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_dec_expanded_key_avx(
+; UINT8 *k2, // key used for tweaking, 16*15 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
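+;
+; illustrative call from C (a sketch only; both key schedules must already be
+; expanded to 15 round keys each in the layout this routine expects, k2 for
+; tweak encryption and k1 for data decryption):
+;     XTS_AES_256_dec_expanded_key_avx(k2_exp, k1_exp, tweak, 512, ct, pt);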
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
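+	; the tweak is encrypted with the k2 schedule while, in the same pass, the
+	; k1 round keys are copied from unaligned memory into the 16-Byte aligned
+	; stack area at [keys]; interleaving the two streams overlaps the loads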
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*14]
+ vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*13]
+ vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*12]
+ vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*11]
+ vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*11]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*12]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*13]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*14]
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*0]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; decrypt initial blocks of AES (uses vaesdec; the macro keeps the encrypt_initial
+; name shared with the encryption variant)
+; 1, 2, 3, 4, 5, 6 or 7 blocks are processed
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Decrypt 8 blocks in parallel (uses vaesdec; the macro keeps the encrypt_by_eight
+; name shared with the encryption variant)
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+ vaesdeclast %%ST5, %%T0
+ vaesdeclast %%ST6, %%T0
+ vaesdeclast %%ST7, %%T0
+ vaesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_256_dec_expanded_key_avx, function
+XTS_AES_256_dec_expanded_key_avx:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
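+	; 0x87 is the low byte of the XTS reduction polynomial x^128 + x^7 + x^2 + x + 1;
+	; it is xored into the tweak whenever the GF(2^128) doubling carries out of bit 127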
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because the last 8 blocks are not stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ vmovdqa xmm1, [TW + 16*7]
+	vmovdqa	[TW + 16*0], xmm1			; swap tweak values for ciphertext stealing on the decrypt path
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
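+	; Decrypt-direction ciphertext stealing: xmm8 holds the last full block,
+	; decrypted under the swapped-in (next) tweak. Its leading N_val bytes are
+	; the final partial plaintext; the remaining bytes are blended with the
+	; partial ciphertext block and decrypted below under the original tweak
+	; saved at [TW].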
+
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+ vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+	vpxor	xmm8, xmm3	; result lands in xmm8: instead of moving xmm3 to xmm8, the vpxor destination is simply swapped
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+	vaesdec	xmm8, [keys + 16*10]	; round 10
+	vaesdec	xmm8, [keys + 16*11]	; round 11
+	vaesdec	xmm8, [keys + 16*12]	; round 12
+	vaesdec	xmm8, [keys + 16*13]	; round 13
+	vaesdeclast	xmm8, [keys + 16*14]	; round 14
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm15
+ vmovdqa xmm15, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm14
+ vmovdqa xmm14, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm13
+ vmovdqa xmm13, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm12
+ vmovdqa xmm12, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm11
+ vmovdqa xmm11, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm10
+ vmovdqa xmm10, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm9
+ vmovdqa xmm9, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
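+; A 16-byte window taken from this table acts as a byte-wise shift under
+; vpshufb, since selector bytes with the top bit set produce zero. Xoring the
+; window with mask1 (0x80 per byte) flips those top bits, giving both the
+; complementary shift and the per-byte selector used by vpblendvb during
+; cipher stealing.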
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm
new file mode 100644
index 000000000..2e77e5e80
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm
@@ -0,0 +1,1898 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 256-bit AES
+; expanded keys are not aligned
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_dec_expanded_key_sse(
+; UINT8 *k2, // key used for tweaking, 16*15 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
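+; Note: N is the sector length in bytes and must be at least one 16-byte
+; block; shorter inputs return without producing output. A trailing partial
+; block is handled with ciphertext stealing.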
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; macro to encrypt the tweak value
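+; The initial tweak is always encrypted with key2 (aesenc), even though this
+; is the decrypt path, while key1's 15 expanded round keys are copied into
+; the aligned stack area for the aesdec rounds; the result is stored at [TW].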
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*14]
+ movdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*13]
+ movdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*2]
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*12]
+ movdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*3]
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*11]
+ movdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*4]
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*10]
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*5]
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*9]
+ movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*6]
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*8]
+ movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*7]
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*7]
+ movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*8]
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*6]
+ movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*9]
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*5]
+ movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*10]
+ aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*4]
+ movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*11]
+ aesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*3]
+ movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*12]
+ aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*2]
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*13]
+ aesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*1]
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*14]
+ aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*0]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
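+; Each tweak after the first is derived from the previous one by the same
+; double-and-reduce step on twtempl:twtemph, spilled to TW[] and loaded into
+; its tweak register alongside the matching input block.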
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
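+; When lt128 is 0, the eight tweaks for the next iteration are computed in
+; twtempl:twtemph and spilled to TW[] in small slices between the aesdec
+; rounds so the scalar work overlaps the AES rounds; when lt128 is 1 these
+; updates are skipped.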
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
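+; The next-tweak computation is interleaved with the aesdec rounds below;
+; here all eight states are always processed, so no per-block %if guards are
+; needed.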
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesdeclast %%ST1, %%T0
+ aesdeclast %%ST2, %%T0
+ aesdeclast %%ST3, %%T0
+ aesdeclast %%ST4, %%T0
+ aesdeclast %%ST5, %%T0
+ aesdeclast %%ST6, %%T0
+ aesdeclast %%ST7, %%T0
+ aesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_256_dec_expanded_key_sse, function
+XTS_AES_256_dec_expanded_key_sse:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because the last 8 blocks are not stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
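+	; The five instructions above multiply the 128-bit tweak (twtempl:twtemph)
+	; by x in GF(2^128), reducing with GHASH_POLY = 0x87. A minimal C sketch of
+	; the same update (illustrative only; lo/hi stand for twtempl/twtemph):
+	;
+	;   uint64_t carry = hi >> 63;             /* CF produced by adc hi, hi   */
+	;   hi = (hi << 1) | (lo >> 63);           /* shl lo, 1 ; adc hi, hi      */
+	;   lo = (lo << 1) ^ (carry ? 0x87 : 0);   /* cmovc + xor with GHASH_POLY */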
+ movdqa xmm1, [TW + 16*7]
+ movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
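+	; Ciphertext stealing on the decrypt path: the tweak order of the last two
+	; blocks is reversed (see the "swap tweak values" step at _last_eight), so
+	; xmm8 already holds the block decrypted with the following tweak. Below,
+	; pshufb_shf_table aligns the stolen tail bytes, pblendvb merges them with
+	; the partial ciphertext, and the result is decrypted with the tweak saved
+	; at [TW].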
+
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesdec xmm8, [keys + 16*1] ; round 1
+ aesdec xmm8, [keys + 16*2] ; round 2
+ aesdec xmm8, [keys + 16*3] ; round 3
+ aesdec xmm8, [keys + 16*4] ; round 4
+ aesdec xmm8, [keys + 16*5] ; round 5
+ aesdec xmm8, [keys + 16*6] ; round 6
+ aesdec xmm8, [keys + 16*7] ; round 7
+ aesdec xmm8, [keys + 16*8] ; round 8
+ aesdec xmm8, [keys + 16*9] ; round 9
+	aesdec	xmm8, [keys + 16*10]                  ; round 10
+	aesdec	xmm8, [keys + 16*11]                  ; round 11
+	aesdec	xmm8, [keys + 16*12]                  ; round 12
+	aesdec	xmm8, [keys + 16*13]                  ; round 13
+	aesdeclast	xmm8, [keys + 16*14]                 ; round 14
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm15
+ movdqa xmm15, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm14
+ movdqa xmm14, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm13
+ movdqa xmm13, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm12
+ movdqa xmm12, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm11
+ movdqa xmm11, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm10
+ movdqa xmm10, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm9
+ movdqa xmm9, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_vaes.asm
new file mode 100644
index 000000000..69228c18c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_vaes.asm
@@ -0,0 +1,1808 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 256-bit AES
+; expanded keys are not aligned
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_dec_expanded_key_vaes(
+; UINT8 *k2, // key used for tweaking, 16*15 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
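+; A minimal C-level usage sketch (illustrative only): the buffer names below
+; are placeholders, and both key schedules are assumed to be expanded already
+; by the caller, e.g. with the ISA-L key-expansion helpers.
+;
+;   uint8_t k2_exp[16*15], k1_exp[16*15];  /* expanded AES-256 round keys   */
+;   uint8_t tweak[16];                     /* initial tweak (sector number) */
+;   uint8_t ct[4096], pt[4096];            /* one sector in, one sector out */
+;   XTS_AES_256_dec_expanded_key_vaes(k2_exp, k1_exp, tweak, sizeof(ct), ct, pt);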
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+%define zpoly zmm25
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*14]
+ vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*13]
+ vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*12]
+ vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*11]
+ vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*11]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*12]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*13]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*14]
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*0]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
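+; encrypt_T both encrypts the initial tweak with the full 14-round key2
+; schedule and, interleaved with those rounds, copies key1's expanded round
+; keys into the aligned stack area at [keys]; the encrypted tweak is left
+; at [TW] for the tweak-generation code below.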
+
+
+; Original way to generate initial tweak values and load plaintext values
+; only used for small blocks
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+%endmacro
+
+
+; Original decrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; next 8 Tweak values can be generated
+%macro decrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks decrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+
+
+; Decrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro decrypt_by_eight_zmm 6
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%TW1 %3 ; tweak 1
+%define %%TW2 %4 ; tweak 2
+%define %%T0 %5 ; Temp register
+%define %%last_eight %6
+
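+	; Unless %%last_eight is set, the vpsrldq/vpclmulqdq/vpslldq/vpxord steps
+	; interleaved with the AES rounds advance every tweak lane by eight block
+	; positions: a one-byte (x^8) left shift per 128-bit lane, with the
+	; shifted-out top byte reduced back in via a carry-less multiply by zpoly.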
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW1, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW1, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW2, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW2, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 11
+ vbroadcasti32x4 %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 12
+ vbroadcasti32x4 %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 13
+ vbroadcasti32x4 %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 14
+ vbroadcasti32x4 %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+%endmacro
+
+
+; Decrypt 16 blocks in parallel
+; generate next 16 tweak values
+%macro decrypt_by_16_zmm 10
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+
+%define %%TW1 %5 ; tweak 1
+%define %%TW2 %6 ; tweak 2
+%define %%TW3 %7 ; tweak 3
+%define %%TW4 %8 ; tweak 4
+
+%define %%T0 %9 ; Temp register
+%define %%last_eight %10
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+ vpxorq %%ST3, %%T0
+ vpxorq %%ST4, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW3, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW3, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW4, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW4, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm15, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm17, zmm15, 1
+ vpxord zmm17, zmm17, zmm14
+%endif
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm16, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm18, zmm16, 1
+ vpxord zmm18, zmm18, zmm14
+%endif
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 11
+ vbroadcasti32x4 %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 12
+ vbroadcasti32x4 %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 13
+ vbroadcasti32x4 %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 14
+ vbroadcasti32x4 %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+ vmovdqa32 %%TW3, zmm17
+ vmovdqa32 %%TW4, zmm18
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_256_dec_expanded_key_vaes, function
+XTS_AES_256_dec_expanded_key_vaes:
+ endbranch
+
+%define ALIGN_STACK
+%ifdef ALIGN_STACK
+ push rbp
+ mov rbp, rsp
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+%else
+ sub rsp, VARIABLE_OFFSET
+%endif
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+ cmp N_val, 128
+ jl _less_than_128_bytes
+
+ vpbroadcastq zpoly, ghash_poly_8b
+
+ cmp N_val, 256
+ jge _start_by16
+
+ cmp N_val, 128
+ jge _start_by8
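+	; Dispatch: sectors of at least 256 bytes run the 16-block zmm loop,
+	; 128-255 bytes run the 8-block loop, and the remainder after those loops
+	; is handled block by block in _do_n_blocks, with cipher stealing for a
+	; partial tail.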
+
+_do_n_blocks:
+ cmp N_val, 0
+ je _ret_
+
+ cmp N_val, (7*16)
+ jge _remaining_num_blocks_is_7
+
+ cmp N_val, (6*16)
+ jge _remaining_num_blocks_is_6
+
+ cmp N_val, (5*16)
+ jge _remaining_num_blocks_is_5
+
+ cmp N_val, (4*16)
+ jge _remaining_num_blocks_is_4
+
+ cmp N_val, (3*16)
+ jge _remaining_num_blocks_is_3
+
+ cmp N_val, (2*16)
+ jge _remaining_num_blocks_is_2
+
+ cmp N_val, (1*16)
+ jge _remaining_num_blocks_is_1
+
+;; _remaining_num_blocks_is_0:
+	vmovdqu	xmm1, [ptr_plaintext - 16]	; Re-do the last block with the next tweak
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext - 16], xmm1
+ vmovdqa xmm8, xmm1
+
+ ; Calc previous tweak
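+	; (i.e. divide the current tweak by x in GF(2^128) to derive the preceding
+	; tweak, which _steal_cipher then uses for the final partial block)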
+ mov tmp1, 1
+ kmovq k1, tmp1
+ vpsllq xmm13, xmm9, 63
+ vpsraq xmm14, xmm13, 63
+ vpandq xmm5, xmm14, XWORD(zpoly)
+ vpxorq xmm9 {k1}, xmm9, xmm5
+ vpsrldq xmm10, xmm9, 8
+ vpshrdq xmm0, xmm9, xmm10, 1
+ vpslldq xmm13, xmm13, 8
+ vpxorq xmm0, xmm0, xmm13
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_7:
+ mov tmp1, -1
+ shr tmp1, 16
+ kmovq k1, tmp1
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*7
+ and N_val, 15
+ je _done_7_remain
+ vextracti32x4 xmm12, zmm10, 2
+ vextracti32x4 xmm13, zmm10, 3
+ vinserti32x4 zmm10, xmm13, 2
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ add ptr_ciphertext, 16*7
+ vextracti32x4 xmm8, zmm2, 0x2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_7_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_6:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 ymm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*6
+ and N_val, 15
+ je _done_6_remain
+ vextracti32x4 xmm12, zmm10, 1
+ vextracti32x4 xmm13, zmm10, 2
+ vinserti32x4 zmm10, xmm13, 1
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ add ptr_ciphertext, 16*6
+ vextracti32x4 xmm8, zmm2, 0x1
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_6_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ jmp _ret_
+
+_remaining_num_blocks_is_5:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*5
+ and N_val, 15
+ je _done_5_remain
+ vmovdqa xmm12, xmm10
+ vextracti32x4 xmm10, zmm10, 1
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_5_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_4:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ add ptr_plaintext, 16*4
+ and N_val, 15
+ je _done_4_remain
+ vextracti32x4 xmm12, zmm9, 3
+ vinserti32x4 zmm9, xmm10, 3
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ add ptr_ciphertext, 16*4
+ vextracti32x4 xmm8, zmm1, 0x3
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_4_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ jmp _ret_
+
+_remaining_num_blocks_is_3:
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ add ptr_plaintext, 16*3
+ and N_val, 15
+ je _done_3_remain
+ vextracti32x4 xmm13, zmm9, 2
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 3
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+ vmovdqa xmm0, xmm13
+ jmp _steal_cipher
+_done_3_remain:
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 2
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ jmp _ret_
+
+_remaining_num_blocks_is_2:
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ add ptr_plaintext, 16*2
+ and N_val, 15
+ je _done_2_remain
+ vextracti32x4 xmm10, zmm9, 2
+ vextracti32x4 xmm12, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_2_remain:
+ vextracti32x4 xmm10, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_1:
+ vmovdqu xmm1, [ptr_plaintext]
+ add ptr_plaintext, 16
+ and N_val, 15
+ je _done_1_remain
+ vextracti32x4 xmm11, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm11, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+ vmovdqa xmm8, xmm1
+ vmovdqa xmm0, xmm9
+ jmp _steal_cipher
+_done_1_remain:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ jmp _ret_
+
+
+
+_start_by16:
+	; Make first 7 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+	; Make next 8 tweak values by multiplying them all by 2^8
+ vpsrldq zmm13, zmm9, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm11, zmm9, 1
+ vpxord zmm11, zmm11, zmm14
+
+ vpsrldq zmm15, zmm10, 15
+ vpclmulqdq zmm16, zmm15, zpoly, 0
+ vpslldq zmm12, zmm10, 1
+ vpxord zmm12, zmm12, zmm16
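+	; At this point zmm9/zmm10 hold the tweaks for blocks 0-3 and 4-7 of the
+	; 16-block chunk, and zmm11/zmm12 hold the same tweaks advanced by 2^8,
+	; i.e. the tweaks for blocks 8-11 and 12-15.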
+
+_main_loop_run_16:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ vmovdqu8 zmm3, [ptr_plaintext+16*8]
+ vmovdqu8 zmm4, [ptr_plaintext+16*12]
+ add ptr_plaintext, 256
+
+ decrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ vmovdqu8 [ptr_ciphertext+16*8], zmm3
+ vmovdqu8 [ptr_ciphertext+16*12], zmm4
+ add ptr_ciphertext, 256
+ sub N_val, 256
+ cmp N_val, 256
+ jge _main_loop_run_16
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ jmp _do_n_blocks
+
+_start_by8:
+	; Make first 7 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+_main_loop_run_8:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 128
+
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ add ptr_ciphertext, 128
+ sub N_val, 128
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ jmp _do_n_blocks
+
+_steal_cipher:
+ ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm10, [twtempl+N_val]
+ vpshufb xmm8, xmm10
+
+ vmovdqu xmm3, [ptr_plaintext - 16 + N_val]
+ vmovdqu [ptr_ciphertext - 16 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm10, [twtempl]
+ vpxor xmm10, [mask1]
+ vpshufb xmm3, xmm10
+
+ vpblendvb xmm3, xmm3, xmm2, xmm10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm3, xmm0
+
+ ;decrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+ vaesdec xmm8, [keys + 16*10] ; round 10
+ vaesdec xmm8, [keys + 16*11] ; round 11
+ vaesdec xmm8, [keys + 16*12] ; round 12
+ vaesdec xmm8, [keys + 16*13] ; round 13
+ vaesdeclast xmm8, [keys + 16*14] ; round 14
+
+ ; xor Tweak value
+ vpxor xmm8, xmm8, xmm0
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext - 16], xmm8
+
+_ret_:
+ mov rbx, [_gpr + 8*0]
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+%ifndef ALIGN_STACK
+ add rsp, VARIABLE_OFFSET
+%else
+ mov rsp, rbp
+ pop rbp
+%endif
+ ret
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ and N_val, 15
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa64 xmm16, xmm15
+ vmovdqa xmm15, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*7
+ vmovdqa64 xmm0, xmm16
+ vmovdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm7
+ jmp _done
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ and N_val, 15
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm15, xmm14
+ vmovdqa xmm14, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm0, xmm15
+ vmovdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm6
+ jmp _done
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ and N_val, 15
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm14, xmm13
+ vmovdqa xmm13, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm0, xmm14
+ vmovdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm5
+ jmp _done
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ and N_val, 15
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm13, xmm12
+ vmovdqa xmm12, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm0, xmm13
+ vmovdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _done
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ and N_val, 15
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm12, xmm11
+ vmovdqa xmm11, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm0, xmm12
+ vmovdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+ jmp _done
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ and N_val, 15
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm11, xmm10
+ vmovdqa xmm10, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm0, xmm11
+ vmovdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+ jmp _done
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ and N_val, 15
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, xmm9
+ vmovdqa xmm9, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm0, xmm10
+ vmovdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
+const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3
+const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5
+const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7
+const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1
+
+shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+
+%else ; The assembler does not understand these opcodes. Add an empty symbol for Windows.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_XTS_AES_256_dec_expanded_key_vaes
+no_XTS_AES_256_dec_expanded_key_vaes:
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm
new file mode 100644
index 000000000..3904c8a54
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm
@@ -0,0 +1,1963 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 256-bit AES
+; input keys are not assumed to be aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not assumed to be aligned
+; the expanded schedule of the second key (k1) is stored on the stack, 16-byte aligned
+; the first key (k2) is needed only once, to encrypt the tweak, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_dec_sse(
+; UINT8 *k2, // key used for tweaking, 16*2 bytes
+; UINT8 *k1, // key used for "ECB" decryption, 16*2 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
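+
+; A minimal usage sketch from C (illustrative only; it assumes the routine is
+; declared to the caller, e.g. via the library's aes_xts.h header, and the buffer
+; names are made up):
+;
+;   uint8_t k1[32], k2[32], tweak[16];
+;   uint8_t ct[4096], pt[4096];
+;   /* ... fill keys, tweak and ciphertext ... */
+;   XTS_AES_256_dec_sse(k2, k1, tweak, sizeof(ct), ct, pt);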
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
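+; Note: the ptr_plaintext/ptr_ciphertext names below are carried over from the
+; encrypt variant of this file. Per the prototype above, in this decrypt routine
+; ptr_plaintext holds the ciphertext input (ct) and ptr_ciphertext holds the
+; plaintext output (pt); the "load plaintext"/"store ciphertext" comments follow
+; the register names rather than the data they carry.
+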
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; produce the key for the next round
+; raw_key is the output of the aeskeygenassist instruction
+; round_key value before this key_expansion_256 macro is the current round key
+; round_key value after this key_expansion_256 macro is the next round key
+; 2 macros will be used for key generation in a flip-flopped fashion
+%macro key_expansion_256_flip 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ pshufd %%xraw_key, %%xraw_key, 11111111b
+ shufps %%xtmp, %%xround_key, 00010000b
+ pxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ pxor %%xround_key, %%xtmp
+ pxor %%xround_key, %%xraw_key
+%endmacro
+
+%macro key_expansion_256_flop 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ pshufd %%xraw_key, %%xraw_key, 10101010b
+ shufps %%xtmp, %%xround_key, 00010000b
+ pxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ pxor %%xround_key, %%xtmp
+ pxor %%xround_key, %%xraw_key
+%endmacro
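+
+; Rough summary of the flip/flop pattern above (AES-256 key schedule): the
+; aeskeygenassist result carries SubWord(X3) in dword 2 and RotWord(SubWord(X3))
+; xor RCON in dword 3. key_expansion_256_flip broadcasts dword 3 (pshufd 0xFF)
+; to derive the even 128-bit halves of the schedule, key_expansion_256_flop
+; broadcasts dword 2 (pshufd 0xAA) for the odd halves, and the shufps/pxor
+; sequence forms the running (prefix) XOR of the previous round key's words
+; before the broadcast value is XORed in.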
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 11
+%define %%xkey2 %1
+%define %%xkey2_2 %2
+%define %%xstate_tweak %3
+%define %%xkey1 %4
+%define %%xkey1_2 %5
+%define %%xraw_key %6
+%define %%xtmp %7
+%define %%xtmp2 %8
+%define %%ptr_key2 %9
+%define %%ptr_key1 %10
+%define %%ptr_expanded_keys %11
+
+
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*14], %%xkey1
+
+ movdqu %%xkey2_2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1_2, [%%ptr_key1 + 16*1]
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*13], %%xtmp2
+
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*12], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*11], %%xtmp2
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*10], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*9], %%xtmp2
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*8], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*7], %%xtmp2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*6], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*5], %%xtmp2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*4], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*3], %%xtmp2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*2], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*1], %%xtmp2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
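+
+; Key layout produced by encrypt_T for this decrypt routine (as written above):
+; the last encryption round key (round 14) is stored at [keys + 16*0] and is used
+; for the initial AddRoundKey of decryption, round keys 13..1 are run through
+; aesimc (InvMixColumns) and stored in reverse order for aesdec (the Equivalent
+; Inverse Cipher), and round key 0 is stored untransformed at [keys + 16*14] for
+; aesdeclast. The tweak itself is always encrypted with key2 (aesenc), as XTS
+; requires.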
+
+
+; generate initial tweak values
+; load initial input blocks (ptr_plaintext points at the ciphertext for this decrypt routine)
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
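+
+; The scalar tweak update used throughout this file (xor/shl/adc/cmovc/xor on
+; twtempl:twtemph) multiplies the 128-bit tweak by x in GF(2^128) with the XTS
+; polynomial 0x87. A rough C equivalent (illustrative, not part of the build):
+;
+;   uint64_t carry = hi >> 63;            /* bit shifted out of the 128-bit value */
+;   hi = (hi << 1) | (lo >> 63);          /* shl twtempl,1 ; adc twtemph,twtemph  */
+;   lo = (lo << 1) ^ (carry ? 0x87 : 0);  /* cmovc + xor with ghash_poly_8b       */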
+
+
+; decrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; next 8 Tweak values are generated
+; (the encrypt_initial macro name is retained from the encrypt variant)
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Decrypt 8 blocks in parallel (the encrypt_by_eight macro name is retained from the encrypt variant)
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesdeclast %%ST1, %%T0
+ aesdeclast %%ST2, %%T0
+ aesdeclast %%ST3, %%T0
+ aesdeclast %%ST4, %%T0
+ aesdeclast %%ST5, %%T0
+ aesdeclast %%ST6, %%T0
+ aesdeclast %%ST7, %%T0
+ aesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
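+
+; The macro above stitches the scalar generation of the next eight tweak values
+; in between the aesdec rounds, so the GF(2^128) doublings run in the shadow of
+; the AES round latency; when %%last_eight is non-zero the guarded tweak updates
+; are skipped because no further tweaks are needed.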
+
+
+section .text
+
+mk_global XTS_AES_256_dec_sse, function
+XTS_AES_256_dec_sse:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, xmm7, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; adjust target_ptr_val because the last 8 blocks (128 bytes) are not stitched with tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
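+ ; target_ptr_val now points at the output position where the final (up to)
+ ; eight blocks begin; the main loop runs until ptr_ciphertext reaches it, and
+ ; the last 128 bytes are handled separately so that a trailing partial block
+ ; can be folded in via cipher stealing.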
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ movdqa xmm1, [TW + 16*7]
+ movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
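+ ; For decryption with cipher stealing the tweak order of the last two blocks
+ ; is swapped: the last full block is decrypted with the freshly generated
+ ; tweak (written into TW[7] above), while its own tweak, parked in TW[0], is
+ ; used later in _steal_cipher for the re-assembled partial block.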
+
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
+
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesdec xmm8, [keys + 16*1] ; round 1
+ aesdec xmm8, [keys + 16*2] ; round 2
+ aesdec xmm8, [keys + 16*3] ; round 3
+ aesdec xmm8, [keys + 16*4] ; round 4
+ aesdec xmm8, [keys + 16*5] ; round 5
+ aesdec xmm8, [keys + 16*6] ; round 6
+ aesdec xmm8, [keys + 16*7] ; round 7
+ aesdec xmm8, [keys + 16*8] ; round 8
+ aesdec xmm8, [keys + 16*9] ; round 9
+ aesdec xmm8, [keys + 16*10] ; round 10
+ aesdec xmm8, [keys + 16*11] ; round 11
+ aesdec xmm8, [keys + 16*12] ; round 12
+ aesdec xmm8, [keys + 16*13] ; round 13
+ aesdeclast xmm8, [keys + 16*14] ; round 14
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm15
+ movdqa xmm15, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm14
+ movdqa xmm14, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm13
+ movdqa xmm13, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm12
+ movdqa xmm12, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm11
+ movdqa xmm11, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm10
+ movdqa xmm10, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm9
+ movdqa xmm9, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
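The pshufb_shf_table and mask1 constants above drive the byte shuffles used for XTS cipher stealing when the sector length is not a multiple of 16. As an illustration of the data movement those shuffles achieve (a hedged sketch only; the helper name and buffer layout are placeholders, not ISA-L code):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* For a trailing partial block of n bytes (1 <= n <= 15): prev holds the
 * output of the last full block, tail holds the n leftover input bytes.
 * The first n bytes of prev are emitted as the short final block, and the
 * remaining 16-n bytes are "stolen" to pad the block that receives one
 * more AES pass under the last tweak. */
static void xts_steal_layout(uint8_t padded_block[16], uint8_t short_block[16],
                             const uint8_t prev[16], const uint8_t tail[16],
                             size_t n)
{
    memcpy(short_block, prev, n);               /* short final output block  */
    memcpy(padded_block, tail, n);              /* leftover input bytes      */
    memcpy(padded_block + n, prev + n, 16 - n); /* stolen bytes complete it  */
}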
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_vaes.asm
new file mode 100644
index 000000000..3e26e5c04
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_vaes.asm
@@ -0,0 +1,1875 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 256-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; second key is stored on the stack, aligned to 16 bytes
+; first key is required only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_dec_vaes(
+;               UINT8 *k2,      // key used for tweaking, 16*2 bytes
+;               UINT8 *k1,      // key used for "ECB" decryption, 16*2 bytes
+;               UINT8 *TW_initial,      // initial tweak value, 16 bytes
+;               UINT64 N,       // sector size, in bytes
+;               const UINT8 *ct,        // ciphertext sector input data
+;               UINT8 *pt);     // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
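A hedged usage sketch for this entry point: the C declaration below is inferred from the prototype comment above and is an assumption (ISA-L's public header may differ), and the 4096-byte sector size and buffer names are placeholders. For the decrypt variant the data input is a ciphertext sector and the output is the recovered plaintext.

#include <stdint.h>

void XTS_AES_256_dec_vaes(uint8_t *k2, uint8_t *k1, uint8_t *TW_initial,
                          uint64_t N, const uint8_t *ct, uint8_t *pt);

/* Decrypt one 4096-byte sector: k2 drives the tweak, k1 the data blocks. */
static void decrypt_sector(uint8_t key2[32], uint8_t key1[32], uint8_t tweak[16],
                           const uint8_t ct[4096], uint8_t pt[4096])
{
    XTS_AES_256_dec_vaes(key2, key1, tweak, 4096, ct, pt);
}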
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+%define zpoly zmm25
+
+; produce the key for the next round
+; raw_key is the output of vaeskeygenassist instruction
+; round_key value before this key_expansion_256 macro is the current round key
+; round_key value after this key_expansion_256 macro is the next round key
+; 2 macros will be used for key generation in a flip-flopped fashion
+%macro key_expansion_256_flip 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+%macro key_expansion_256_flop 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 10101010b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 11
+%define %%xkey2 %1
+%define %%xkey2_2 %2
+%define %%xstate_tweak %3
+%define %%xkey1 %4
+%define %%xkey1_2 %5
+%define %%xraw_key %6
+%define %%xtmp %7
+%define %%xtmp2 %8
+%define %%ptr_key2 %9
+%define %%ptr_key1 %10
+%define %%ptr_expanded_keys %11
+
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1
+
+ vmovdqu %%xkey2_2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1_2, [%%ptr_key1 + 16*1]
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*13], %%xtmp2
+
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*12], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*11], %%xtmp2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xtmp2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
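The vaesimc/vmovdqa pairs in encrypt_T build the decryption key schedule while the tweak is being encrypted: round keys are stored in reverse order, and the middle keys are run through InvMixColumns so that vaesdec (the equivalent inverse cipher) can consume them. A minimal C sketch of that storage rule, assuming AES-NI intrinsics are available; the helper name is illustrative, not an ISA-L symbol.

#include <wmmintrin.h>  /* AES-NI intrinsics */

/* Place one AES-256 encryption round key (round 0..14) into the schedule
 * consumed by AESDEC: reverse the order, and apply InvMixColumns to rounds
 * 1..13; the first and last round keys are stored unchanged. */
static void store_dec_round_key(__m128i dec_keys[15], __m128i enc_key, int round)
{
    __m128i k = (round == 0 || round == 14) ? enc_key
                                            : _mm_aesimc_si128(enc_key);
    dec_keys[14 - round] = k;
}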
+
+
+; Original way to generate initial tweak values and load plaintext values
+; only used for small blocks
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+%endmacro
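Each shl/adc/cmovc/xor sequence above multiplies the current 128-bit tweak (kept as the twtempl/twtemph pair) by x in GF(2^128), folding the bit that falls off the top back in with the 0x87 polynomial defined as GHASH_POLY. A small C sketch of the same step (the function name is illustrative only):

#include <stdint.h>

/* Multiply a 128-bit XTS tweak, held as two little-endian 64-bit halves,
 * by x in GF(2^128) with reduction by 0x87. */
static void xts_mult_x(uint64_t *lo, uint64_t *hi)
{
    uint64_t carry_out = *hi >> 63;            /* bit shifted out of the top  */
    uint64_t carry_mid = *lo >> 63;            /* bit carried low -> high     */

    *hi = (*hi << 1) | carry_mid;              /* shl twtempl,1 / adc twtemph */
    *lo = (*lo << 1) ^ (carry_out ? 0x87 : 0); /* cmovc + xor with GHASH_POLY */
}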
+
+
+; Original routine to decrypt the initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; next 8 Tweak values can be generated
+%macro decrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks decrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+
+
+; Decrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro decrypt_by_eight_zmm 6
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%TW1 %3 ; tweak 1
+%define %%TW2 %4 ; tweak 2
+%define %%T0 %5 ; Temp register
+%define %%last_eight %6
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW1, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW1, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW2, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW2, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 11
+ vbroadcasti32x4 %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 12
+ vbroadcasti32x4 %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 13
+ vbroadcasti32x4 %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 14
+ vbroadcasti32x4 %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+%endmacro
+
+
+; Decrypt 16 blocks in parallel
+; generate next 16 tweak values
+%macro decrypt_by_16_zmm 10
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+
+%define %%TW1 %5 ; tweak 1
+%define %%TW2 %6 ; tweak 2
+%define %%TW3 %7 ; tweak 3
+%define %%TW4 %8 ; tweak 4
+
+%define %%T0 %9 ; Temp register
+%define %%last_eight %10
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+ vpxorq %%ST3, %%T0
+ vpxorq %%ST4, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW3, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW3, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW4, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW4, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm15, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm17, zmm15, 1
+ vpxord zmm17, zmm17, zmm14
+%endif
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm16, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm18, zmm16, 1
+ vpxord zmm18, zmm18, zmm14
+%endif
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 11
+ vbroadcasti32x4 %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 12
+ vbroadcasti32x4 %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 13
+ vbroadcasti32x4 %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 14
+ vbroadcasti32x4 %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+ vmovdqa32 %%TW3, zmm17
+ vmovdqa32 %%TW4, zmm18
+%endmacro
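Between AES rounds, decrypt_by_eight_zmm and decrypt_by_16_zmm advance four tweaks per zmm register by x^8 in one step: each 128-bit lane is shifted left by one byte and the byte that drops off the top is folded back in with a carry-less multiply against the 0x87 constant broadcast in zpoly. A hedged intrinsics sketch of that step (assumes a compiler with AVX-512BW and VPCLMULQDQ support; the helper name is illustrative):

#include <immintrin.h>

/* Advance four 128-bit tweaks held in one zmm register by x^8 each,
 * mirroring the vpsrldq/vpclmulqdq/vpslldq/vpxord sequence above.
 * zpoly holds the 0x87 reduction constant in every 128-bit lane. */
static __m512i xts_tweaks_mult_x8(__m512i tw, __m512i zpoly)
{
    __m512i top  = _mm512_bsrli_epi128(tw, 15);                /* vpsrldq tw, 15 */
    __m512i fold = _mm512_clmulepi64_epi128(top, zpoly, 0x00); /* vpclmulqdq     */
    __m512i shl  = _mm512_bslli_epi128(tw, 1);                 /* vpslldq tw, 1  */
    return _mm512_xor_si512(shl, fold);                        /* vpxord         */
}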
+
+
+section .text
+
+mk_global XTS_AES_256_dec_vaes, function
+XTS_AES_256_dec_vaes:
+ endbranch
+
+%define ALIGN_STACK
+%ifdef ALIGN_STACK
+ push rbp
+ mov rbp, rsp
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+%else
+ sub rsp, VARIABLE_OFFSET
+%endif
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, xmm7, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+ cmp N_val, 128
+ jl _less_than_128_bytes
+
+ vpbroadcastq zpoly, ghash_poly_8b
+
+ cmp N_val, 256
+ jge _start_by16
+
+ cmp N_val, 128
+ jge _start_by8
+
+_do_n_blocks:
+ cmp N_val, 0
+ je _ret_
+
+ cmp N_val, (7*16)
+ jge _remaining_num_blocks_is_7
+
+ cmp N_val, (6*16)
+ jge _remaining_num_blocks_is_6
+
+ cmp N_val, (5*16)
+ jge _remaining_num_blocks_is_5
+
+ cmp N_val, (4*16)
+ jge _remaining_num_blocks_is_4
+
+ cmp N_val, (3*16)
+ jge _remaining_num_blocks_is_3
+
+ cmp N_val, (2*16)
+ jge _remaining_num_blocks_is_2
+
+ cmp N_val, (1*16)
+ jge _remaining_num_blocks_is_1
+
+;; _remaining_num_blocks_is_0:
+	vmovdqu	xmm1, [ptr_plaintext - 16]	; Redo last block with next tweak
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext - 16], xmm1
+ vmovdqa xmm8, xmm1
+
+ ; Calc previous tweak
+ mov tmp1, 1
+ kmovq k1, tmp1
+ vpsllq xmm13, xmm9, 63
+ vpsraq xmm14, xmm13, 63
+ vpandq xmm5, xmm14, XWORD(zpoly)
+ vpxorq xmm9 {k1}, xmm9, xmm5
+ vpsrldq xmm10, xmm9, 8
+ vpshrdq xmm0, xmm9, xmm10, 1
+ vpslldq xmm13, xmm13, 8
+ vpxorq xmm0, xmm0, xmm13
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_7:
+ mov tmp1, -1
+ shr tmp1, 16
+ kmovq k1, tmp1
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*7
+ and N_val, 15
+ je _done_7_remain
+ vextracti32x4 xmm12, zmm10, 2
+ vextracti32x4 xmm13, zmm10, 3
+ vinserti32x4 zmm10, xmm13, 2
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ add ptr_ciphertext, 16*7
+ vextracti32x4 xmm8, zmm2, 0x2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_7_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_6:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 ymm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*6
+ and N_val, 15
+ je _done_6_remain
+ vextracti32x4 xmm12, zmm10, 1
+ vextracti32x4 xmm13, zmm10, 2
+ vinserti32x4 zmm10, xmm13, 1
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ add ptr_ciphertext, 16*6
+ vextracti32x4 xmm8, zmm2, 0x1
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_6_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ jmp _ret_
+
+_remaining_num_blocks_is_5:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*5
+ and N_val, 15
+ je _done_5_remain
+ vmovdqa xmm12, xmm10
+ vextracti32x4 xmm10, zmm10, 1
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_5_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_4:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ add ptr_plaintext, 16*4
+ and N_val, 15
+ je _done_4_remain
+ vextracti32x4 xmm12, zmm9, 3
+ vinserti32x4 zmm9, xmm10, 3
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ add ptr_ciphertext, 16*4
+ vextracti32x4 xmm8, zmm1, 0x3
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_4_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ jmp _ret_
+
+_remaining_num_blocks_is_3:
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ add ptr_plaintext, 16*3
+ and N_val, 15
+ je _done_3_remain
+ vextracti32x4 xmm13, zmm9, 2
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 3
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+ vmovdqa xmm0, xmm13
+ jmp _steal_cipher
+_done_3_remain:
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 2
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ jmp _ret_
+
+_remaining_num_blocks_is_2:
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ add ptr_plaintext, 16*2
+ and N_val, 15
+ je _done_2_remain
+ vextracti32x4 xmm10, zmm9, 2
+ vextracti32x4 xmm12, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_2_remain:
+ vextracti32x4 xmm10, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_1:
+ vmovdqu xmm1, [ptr_plaintext]
+ add ptr_plaintext, 16
+ and N_val, 15
+ je _done_1_remain
+ vextracti32x4 xmm11, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm11, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+ vmovdqa xmm8, xmm1
+ vmovdqa xmm0, xmm9
+ jmp _steal_cipher
+_done_1_remain:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ jmp _ret_
+
+
+
+_start_by16:
+	; Make first 7 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+	; Make next 8 tweak values by all x 2^8
+ vpsrldq zmm13, zmm9, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm11, zmm9, 1
+ vpxord zmm11, zmm11, zmm14
+
+ vpsrldq zmm15, zmm10, 15
+ vpclmulqdq zmm16, zmm15, zpoly, 0
+ vpslldq zmm12, zmm10, 1
+ vpxord zmm12, zmm12, zmm16
+
+_main_loop_run_16:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ vmovdqu8 zmm3, [ptr_plaintext+16*8]
+ vmovdqu8 zmm4, [ptr_plaintext+16*12]
+ add ptr_plaintext, 256
+
+ decrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ vmovdqu8 [ptr_ciphertext+16*8], zmm3
+ vmovdqu8 [ptr_ciphertext+16*12], zmm4
+ add ptr_ciphertext, 256
+ sub N_val, 256
+ cmp N_val, 256
+ jge _main_loop_run_16
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ jmp _do_n_blocks
+
+_start_by8:
+	; Make first 7 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+_main_loop_run_8:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 128
+
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ add ptr_ciphertext, 128
+ sub N_val, 128
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ jmp _do_n_blocks
+
+_steal_cipher:
+ ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm10, [twtempl+N_val]
+ vpshufb xmm8, xmm10
+
+ vmovdqu xmm3, [ptr_plaintext - 16 + N_val]
+ vmovdqu [ptr_ciphertext - 16 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm10, [twtempl]
+ vpxor xmm10, [mask1]
+ vpshufb xmm3, xmm10
+
+ vpblendvb xmm3, xmm3, xmm2, xmm10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm3, xmm0
+
+ ;decrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+ vaesdec xmm8, [keys + 16*10] ; round 10
+ vaesdec xmm8, [keys + 16*11] ; round 11
+ vaesdec xmm8, [keys + 16*12] ; round 12
+ vaesdec xmm8, [keys + 16*13] ; round 13
+ vaesdeclast xmm8, [keys + 16*14] ; round 14
+
+ ; xor Tweak value
+ vpxor xmm8, xmm8, xmm0
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext - 16], xmm8
+
+_ret_:
+ mov rbx, [_gpr + 8*0]
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+%ifndef ALIGN_STACK
+ add rsp, VARIABLE_OFFSET
+%else
+ mov rsp, rbp
+ pop rbp
+%endif
+ ret
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ and N_val, 15
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa64 xmm16, xmm15
+ vmovdqa xmm15, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*7
+ vmovdqa64 xmm0, xmm16
+ vmovdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm7
+ jmp _done
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ and N_val, 15
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm15, xmm14
+ vmovdqa xmm14, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm0, xmm15
+ vmovdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm6
+ jmp _done
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ and N_val, 15
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm14, xmm13
+ vmovdqa xmm13, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm0, xmm14
+ vmovdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm5
+ jmp _done
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ and N_val, 15
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm13, xmm12
+ vmovdqa xmm12, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm0, xmm13
+ vmovdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _done
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ and N_val, 15
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm12, xmm11
+ vmovdqa xmm11, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm0, xmm12
+ vmovdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+ jmp _done
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ and N_val, 15
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm11, xmm10
+ vmovdqa xmm10, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm0, xmm11
+ vmovdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+ jmp _done
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ and N_val, 15
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, xmm9
+ vmovdqa xmm9, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm0, xmm10
+ vmovdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
+const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3
+const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5
+const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7
+const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1
+
+shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+
+%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_XTS_AES_256_dec_vaes
+no_XTS_AES_256_dec_vaes:
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm
new file mode 100644
index 000000000..0993ff909
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm
@@ -0,0 +1,1708 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 256-bit AES
+; input keys are not assumed to be aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not assumed to be aligned
+; the second key (k1) is expanded and stored on the stack, 16-Byte aligned
+; the first key (k2) is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
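+
+; Stack frame layout once 'sub rsp, VARIABLE_OFFSET' has executed in the function
+; body below: 8 tweak values at [TW], the 15 expanded round keys at [keys], the
+; saved xmm6-xmm15 at [_xmm] (win64 only), and the saved GPRs at [_gpr].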
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_enc_avx(
+; UINT8 *k2, // key used for tweaking, 16*2 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
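+;
+; For reference, a minimal C caller sketch for the prototype above (the 512-byte
+; sector length and buffer names are illustrative assumptions, not requirements;
+; XTS needs at least one full 16-byte block):
+;
+;   #include <stdint.h>
+;
+;   extern void XTS_AES_256_enc_avx(uint8_t *k2, uint8_t *k1, uint8_t *TW_initial,
+;                                   uint64_t N, const uint8_t *pt, uint8_t *ct);
+;
+;   void encrypt_sector(uint8_t k2[32], uint8_t k1[32], uint8_t iv[16],
+;                       const uint8_t pt[512], uint8_t ct[512])
+;   {
+;           XTS_AES_256_enc_avx(k2, k1, iv, 512, pt, ct);
+;   }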
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; produce the key for the next round
+; raw_key is the output of the vaeskeygenassist instruction
+; round_key holds the current round key on entry to a key_expansion_256 macro
+; and the next round key on exit
+; the two macros below are used alternately (flip/flop) for key generation
+%macro key_expansion_256_flip 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+%macro key_expansion_256_flop 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 10101010b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
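+
+; For reference, the two macros above implement the standard AES-256 key-schedule
+; step; the vshufps immediates are a register-only way of producing the
+; byte-shifted copies that a conventional formulation obtains with byte shifts.
+; A C-intrinsics sketch of the equivalent 'flip' step (function name is
+; illustrative, not part of this library; raw comes from
+; _mm_aeskeygenassist_si128 on the previous round key):
+;
+;   #include <immintrin.h>
+;
+;   static __m128i key_expansion_256_flip_c(__m128i raw, __m128i key)
+;   {
+;           __m128i t;
+;           raw = _mm_shuffle_epi32(raw, 0xff);   /* broadcast SubWord(RotWord())^rcon */
+;           t   = _mm_slli_si128(key, 4);
+;           key = _mm_xor_si128(key, t);
+;           t   = _mm_slli_si128(t, 4);
+;           key = _mm_xor_si128(key, t);
+;           t   = _mm_slli_si128(t, 4);
+;           key = _mm_xor_si128(key, t);
+;           return _mm_xor_si128(key, raw);       /* next round key */
+;   }
+;
+;   /* the 'flop' step is identical except raw is broadcast with 0xaa */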
+
+
+
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 10
+%define %%xkey2 %1
+%define %%xkey2_2 %2
+%define %%xstate_tweak %3
+%define %%xkey1 %4
+%define %%xkey1_2 %5
+%define %%xraw_key %6
+%define %%xtmp %7
+%define %%ptr_key2 %8
+%define %%ptr_key1 %9
+%define %%ptr_expanded_keys %10
+
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ vmovdqu %%xkey2_2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1_2, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1_2
+
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1_2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1_2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1_2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1_2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1_2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1_2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
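+
+; Net effect of encrypt_T: [TW] holds T0 = AES-256-Encrypt(key2, TW_initial) and
+; [keys+16*0 .. keys+16*14] holds the 15 expanded round keys of key1; key2's
+; schedule is generated only transiently to encrypt the tweak and is never stored.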
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
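+
+; The shl/adc/cmovc/xor sequence above (and repeated throughout this file)
+; multiplies the 128-bit tweak by x in GF(2^128) using the XTS reduction
+; constant GHASH_POLY (0x87). A plain C sketch of the same update, with
+; T[0] = low qword and T[1] = high qword (helper name is illustrative):
+;
+;   #include <stdint.h>
+;
+;   static void xts_mul_x(uint64_t T[2])
+;   {
+;           uint64_t carry = T[1] >> 63;           /* bit shifted out of the top */
+;           T[1] = (T[1] << 1) | (T[0] >> 63);     /* 128-bit shift left by one  */
+;           T[0] = (T[0] << 1) ^ (carry ? 0x87 : 0);
+;   }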
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
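+
+; Note: when %%lt128 is 0, the next eight tweak values are computed in the gaps
+; between AES rounds, so the scalar tweak arithmetic can overlap with the latency
+; of the vaesenc instructions instead of adding serial time.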
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+ vaesenclast %%ST5, %%T0
+ vaesenclast %%ST6, %%T0
+ vaesenclast %%ST7, %%T0
+ vaesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_256_enc_avx, function
+XTS_AES_256_enc_avx:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; adjust target_ptr_val because the last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+ vpblendvb xmm3, xmm3, xmm2, xmm0 ; blend using xmm0 as the mask
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+ vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+ vaesenc xmm8, [keys + 16*10] ; round 10
+ vaesenc xmm8, [keys + 16*11] ; round 11
+ vaesenc xmm8, [keys + 16*12] ; round 12
+ vaesenc xmm8, [keys + 16*13] ; round 13
+ vaesenclast xmm8, [keys + 16*14] ; round 14
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
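+; For readers unfamiliar with XTS ciphertext stealing: the code between
+; _steal_cipher and _done implements the standard construction. A conceptual C
+; sketch (the assembly does the byte shuffles with vpshufb_shf_table/mask1, and
+; xts_encrypt_block is a hypothetical helper doing tweak-xor / AES-256 / tweak-xor
+; with the next tweak value):
+;
+;   #include <stdint.h>
+;   #include <string.h>
+;
+;   void xts_encrypt_block(const uint8_t in[16], uint8_t out[16]); /* hypothetical */
+;
+;   void xts_steal(uint8_t *out_full,      /* 16-byte slot of the last full block  */
+;                  uint8_t *out_tail,      /* tail_len-byte slot after it          */
+;                  const uint8_t cc[16],   /* ciphertext of the last full block    */
+;                  const uint8_t *p_tail, size_t tail_len)
+;   {
+;           uint8_t pp[16];
+;           memcpy(out_tail, cc, tail_len);              /* Cm = head of Cm-1       */
+;           memcpy(pp, p_tail, tail_len);                /* PP = Pm || tail of Cm-1 */
+;           memcpy(pp + tail_len, cc + tail_len, 16 - tail_len);
+;           xts_encrypt_block(pp, out_full);             /* Cm-1 = E(PP)            */
+;   }
+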
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_avx.asm
new file mode 100644
index 000000000..6db85486d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_avx.asm
@@ -0,0 +1,1653 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 256-bit AES
+; expanded keys are not assumed to be aligned
+; plaintext and ciphertext are not assumed to be aligned
+; the second key (k1) is copied onto the stack, 16-Byte aligned
+; the first key (k2) is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_enc_expanded_key_avx(
+; UINT8 *k2, // key used for tweaking, 16*15 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
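+
+; This variant takes the 15 pre-expanded round keys for both k2 and k1 (16*15
+; bytes each): encrypt_T below simply copies k1's schedule onto the stack while
+; running the tweak encryption with k2's schedule. Producing the expanded keys
+; (e.g. with ISA-L's AES key-expansion routines) is up to the caller and outside
+; the scope of this file.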
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*11]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*11]
+ vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*12]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*12]
+ vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*13]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*13]
+ vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*14]
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*14]
+ vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
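+
+; Note on encrypt_initial above and encrypt_by_eight below: the scalar generation
+; of the next tweak values (the shl/adc/cmovc/xor and mov-to-[TW] steps) is
+; interleaved with the AES rounds so that it executes in the shadow of the aesenc
+; latency; in encrypt_by_eight the final two tweak qword stores are deferred until
+; after aesenclast for the same reason.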
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+ vaesenclast %%ST5, %%T0
+ vaesenclast %%ST6, %%T0
+ vaesenclast %%ST7, %%T0
+ vaesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_256_enc_expanded_key_avx, function
+XTS_AES_256_enc_expanded_key_avx:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
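+	; XTS ciphertext stealing: N_val mod 16 trailing plaintext bytes remain.
+	; Roughly (illustrative description, r = N_val mod 16): the first r bytes of
+	; the last full ciphertext block become the final partial ciphertext, and that
+	; block's 16-byte slot is rewritten with AES-enc(next tweak, r-byte plaintext
+	; tail || remaining 16-r ciphertext bytes), as implemented below.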
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+	vpblendvb	xmm3, xmm3, xmm2, xmm0	; blend xmm3 and xmm2, xmm0 is the explicit byte selector in the VEX encoding
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+ vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+	vaesenc		xmm8, [keys + 16*10]		; round 10
+	vaesenc		xmm8, [keys + 16*11]		; round 11
+	vaesenc		xmm8, [keys + 16*12]		; round 12
+	vaesenc		xmm8, [keys + 16*13]		; round 13
+	vaesenclast	xmm8, [keys + 16*14]		; round 14
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values as shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+;	dq 0x8887868584838281, 0x008f8e8d8c8b8a89	; shl 15 (16-1) / shr1
+;	dq 0x8988878685848382, 0x01008f8e8d8c8b8a	; shl 14 (16-2) / shr2
+;	dq 0x8a89888786858483, 0x0201008f8e8d8c8b	; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
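+; the steal-cipher code loads 16 bytes of this table at offset N_val (1..15) to
+; left-shift the last full ciphertext block, and 16 bytes at offset 16-N_val
+; (XORed with mask1 below) to right-shift the partial plaintext; the XORed
+; control register also serves as the byte selector for vpblendvb.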
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_sse.asm
new file mode 100644
index 000000000..51cb31074
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_sse.asm
@@ -0,0 +1,1652 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 256-bit AES
+; expanded keys are not aligned
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
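+; 0x87 is the low byte of the XTS reduction polynomial x^128 + x^7 + x^2 + x + 1,
+; used when multiplying the tweak by x in GF(2^128)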
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_enc_expanded_key_sse(
+; UINT8 *k2, // key used for tweaking, 16*15 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
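+;
+; Illustrative C-side call (a sketch; the variable names are hypothetical, and both
+; key schedules must already be expanded to 15 round keys of 16 bytes each):
+;	uint8_t k2_exp[16*15], k1_exp[16*15], tweak_iv[16];
+;	XTS_AES_256_enc_expanded_key_sse(k2_exp, k1_exp, tweak_iv, sector_len, pt, ct);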
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; macro to encrypt the tweak value
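+; (it also copies the 15 round keys of key1 into the 16-byte-aligned [keys] area
+; on the stack, so the bulk-encryption macros can load them with movdqa)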
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*1]
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*2]
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*2]
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*3]
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*3]
+ movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*4]
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*4]
+ movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*5]
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*5]
+ movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*6]
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*6]
+ movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*7]
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*7]
+ movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*8]
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*8]
+ movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*9]
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*9]
+ movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*10]
+ aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*10]
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*11]
+ aesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*11]
+ movdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*12]
+ aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*12]
+ movdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*13]
+ aesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*13]
+ movdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*14]
+ aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*14]
+ movdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesenclast %%ST1, %%T0
+ aesenclast %%ST2, %%T0
+ aesenclast %%ST3, %%T0
+ aesenclast %%ST4, %%T0
+ aesenclast %%ST5, %%T0
+ aesenclast %%ST6, %%T0
+ aesenclast %%ST7, %%T0
+ aesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
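+
+; For reference, the scalar tweak update interleaved ("stitched") between the
+; AES rounds above is the XTS GF(2^128) multiply-by-2, spread across the rounds
+; to hide its latency. A minimal sketch of one update, with the current tweak in
+; twtempl (low 64 bits) and twtemph (high 64 bits) and 0x87 in ghash_poly_8b:
+;
+;	xor	ghash_poly_8b_temp, ghash_poly_8b_temp	; scratch = 0
+;	shl	twtempl, 1				; low half * 2, top bit -> CF
+;	adc	twtemph, twtemph			; high half * 2 + carry in
+;	cmovc	ghash_poly_8b_temp, ghash_poly_8b	; bit 127 fell out -> select 0x87
+;	xor	twtempl, ghash_poly_8b_temp		; reduce mod x^128 + x^7 + x^2 + x + 1
+;
+; i.e. roughly T = (T << 1) ^ (0x87 if the old bit 127 was set, else 0).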
+
+
+section .text
+
+mk_global XTS_AES_256_enc_expanded_key_sse, function
+XTS_AES_256_enc_expanded_key_sse:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; adjust target_ptr_val because the last 8 blocks will not be stitched with the tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; xmm8 becomes the state register: swapping the pxor destination avoids a move from xmm3 to xmm8
+
+
+ ;encrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesenc xmm8, [keys + 16*1] ; round 1
+ aesenc xmm8, [keys + 16*2] ; round 2
+ aesenc xmm8, [keys + 16*3] ; round 3
+ aesenc xmm8, [keys + 16*4] ; round 4
+ aesenc xmm8, [keys + 16*5] ; round 5
+ aesenc xmm8, [keys + 16*6] ; round 6
+ aesenc xmm8, [keys + 16*7] ; round 7
+ aesenc xmm8, [keys + 16*8] ; round 8
+ aesenc xmm8, [keys + 16*9] ; round 9
+ aesenc xmm8, [keys + 16*10] ; round 10
+ aesenc xmm8, [keys + 16*11] ; round 11
+ aesenc xmm8, [keys + 16*12] ; round 12
+ aesenc xmm8, [keys + 16*13] ; round 13
+ aesenclast xmm8, [keys + 16*14] ; round 14
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+pshufb_shf_table:
+; shift constants for the pshufb instruction
+; different offsets into the table give the values shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
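+
+; A sketch of how _steal_cipher uses this table: with N_val holding the tail
+; length (1..15), the mask loaded from [pshufb_shf_table + N_val] makes pshufb
+; shift the last full ciphertext block left by 16-N_val bytes (e.g. for
+; N_val = 1 it moves byte 0 to byte 15 and zeroes the rest), while the mask
+; loaded from [pshufb_shf_table + 16 - N_val] and xored with mask1 shifts the
+; final partial plaintext block right by 16-N_val bytes, so the two can be
+; merged with pblendvb before the last AES pass.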
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_vaes.asm
new file mode 100644
index 000000000..37a5dc792
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_vaes.asm
@@ -0,0 +1,1634 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 256-bit AES
+; expanded keys are not required to be aligned
+; the key schedule is copied to the stack in parallel with the tweak encryption
+; plaintext and ciphertext are not required to be aligned
+; the second key (k1) is stored on the stack, aligned to 16 bytes
+; the first key (k2) is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_enc_expanded_key_vaes(
+; UINT8 *k2, // key used for tweaking, 16*15 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
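+;
+; A minimal usage sketch (C-style, assuming both keys have already been
+; expanded into 15 16-byte round keys each, e.g. with the library's
+; aes_keyexp_256() helper, and that N is at least 16 bytes; a length that is
+; not a multiple of 16 is handled by ciphertext stealing):
+;
+;	uint8_t k1_exp[16*15], k2_exp[16*15], tweak[16];
+;	uint8_t pt[512], ct[512];
+;	/* ... fill keys, tweak and plaintext ... */
+;	XTS_AES_256_enc_expanded_key_vaes(k2_exp, k1_exp, tweak, 512, pt, ct);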
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+%define zpoly zmm25
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*11]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*11]
+ vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*12]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*12]
+ vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*13]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*13]
+ vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*14]
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*14]
+ vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
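+
+; On return from encrypt_T the stack area is laid out as the rest of this file
+; expects (a sketch, offsets relative to rsp):
+;	[TW + 16*0]		the encrypted initial tweak (tweak for block 0);
+;				the TW area later holds up to 8 tweak values
+;	[keys + 16*0..16*14]	the 15 round keys of k1, each 16-byte aligned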
+
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight_zmm 6
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%TW1 %3 ; tweak 1
+%define %%TW2 %4 ; tweak 2
+%define %%T0 %5 ; Temp register
+%define %%last_eight %6
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW1, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW1, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW2, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW2, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 11
+ vbroadcasti32x4 %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 12
+ vbroadcasti32x4 %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 13
+ vbroadcasti32x4 %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 14
+ vbroadcasti32x4 %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+%endmacro
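+
+; For reference, the in-register tweak update used above multiplies every
+; 128-bit lane of a tweak register by x^8 in GF(2^128), i.e. advances each
+; tweak by 8 block positions. A sketch of one update of a tweak register TW:
+;
+;	vpsrldq		zmm13, TW, 15		; isolate the top byte of each lane
+;	vpclmulqdq	zmm14, zmm13, zpoly, 0	; top byte * 0x87, carry-less
+;	vpslldq		zmm15, TW, 1		; shift each lane left by one byte
+;	vpxord		zmm15, zmm15, zmm14	; fold the reduction back in
+;
+; so a register holding {T*2^i .. T*2^(i+3)} becomes {T*2^(i+8) .. T*2^(i+11)}.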
+
+
+; Encrypt 16 blocks in parallel
+; generate next 16 tweak values
+%macro encrypt_by_16_zmm 10
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+
+%define %%TW1 %5 ; tweak 1
+%define %%TW2 %6 ; tweak 2
+%define %%TW3 %7 ; tweak 3
+%define %%TW4 %8 ; tweak 4
+
+%define %%T0 %9 ; Temp register
+%define %%last_eight %10
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+ vpxorq %%ST3, %%T0
+ vpxorq %%ST4, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW3, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW3, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW4, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW4, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm15, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm17, zmm15, 1
+ vpxord zmm17, zmm17, zmm14
+%endif
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm16, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm18, zmm16, 1
+ vpxord zmm18, zmm18, zmm14
+%endif
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 11
+ vbroadcasti32x4 %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 12
+ vbroadcasti32x4 %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 13
+ vbroadcasti32x4 %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 14
+ vbroadcasti32x4 %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+ vmovdqa32 %%TW3, zmm17
+ vmovdqa32 %%TW4, zmm18
+%endmacro
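+
+; The 16-block variant follows the same pattern with four state registers; a
+; sketch of its tweak bookkeeping: if TW1..TW4 cover blocks n .. n+15, the next
+; set is built as TW3*2^8 (blocks n+16..n+19), TW4*2^8 (n+20..n+23),
+; TW3*2^16 (n+24..n+27) and TW4*2^16 (n+28..n+31), again interleaved with the
+; AES rounds.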
+
+
+section .text
+
+mk_global XTS_AES_256_enc_expanded_key_vaes, function
+XTS_AES_256_enc_expanded_key_vaes:
+ endbranch
+
+%define ALIGN_STACK
+%ifdef ALIGN_STACK
+ push rbp
+ mov rbp, rsp
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+%else
+ sub rsp, VARIABLE_OFFSET
+%endif
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+ cmp N_val, 128
+ jl _less_than_128_bytes
+
+ vpbroadcastq zpoly, ghash_poly_8b
+
+ cmp N_val, 256
+ jge _start_by16
+
+ cmp N_val, 128
+ jge _start_by8
+
+_do_n_blocks:
+ cmp N_val, 0
+ je _ret_
+
+ cmp N_val, (7*16)
+ jge _remaining_num_blocks_is_7
+
+ cmp N_val, (6*16)
+ jge _remaining_num_blocks_is_6
+
+ cmp N_val, (5*16)
+ jge _remaining_num_blocks_is_5
+
+ cmp N_val, (4*16)
+ jge _remaining_num_blocks_is_4
+
+ cmp N_val, (3*16)
+ jge _remaining_num_blocks_is_3
+
+ cmp N_val, (2*16)
+ jge _remaining_num_blocks_is_2
+
+ cmp N_val, (1*16)
+ jge _remaining_num_blocks_is_1
+
+;; _remaining_num_blocks_is_0:
+ vmovdqa xmm8, xmm0
+ vmovdqa xmm0, xmm9
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_7:
+ mov tmp1, -1
+ shr tmp1, 16
+ kmovq k1, tmp1
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*7
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ add ptr_ciphertext, 16*7
+
+ vextracti32x4 xmm8, zmm2, 0x2
+ vextracti32x4 xmm0, zmm10, 0x3
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_6:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 ymm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*6
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ add ptr_ciphertext, 16*6
+
+ vextracti32x4 xmm8, zmm2, 0x1
+ vextracti32x4 xmm0, zmm10, 0x2
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_5:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*5
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ add ptr_ciphertext, 16*5
+
+ movdqa xmm8, xmm2
+ vextracti32x4 xmm0, zmm10, 0x1
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_4:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ add ptr_plaintext, 16*4
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ add ptr_ciphertext, 16*4
+
+ vextracti32x4 xmm8, zmm1, 0x3
+ vextracti32x4 xmm0, zmm10, 0x0
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_3:
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 2
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ vmovdqa xmm8, xmm3
+ vextracti32x4 xmm0, zmm9, 3
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_2:
+ vextracti32x4 xmm10, zmm9, 1
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*2
+
+ vmovdqa xmm8, xmm2
+ vextracti32x4 xmm0, zmm9, 2
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_1:
+ vmovdqu xmm1, [ptr_plaintext]
+ add ptr_plaintext, 16
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ vmovdqa xmm8, xmm1
+ vextracti32x4 xmm0, zmm9, 1
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+
+_start_by16:
+ ; Make first 7 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+ ; Make next 8 tweak values by multiplying all by 2^8
+ vpsrldq zmm13, zmm9, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm11, zmm9, 1
+ vpxord zmm11, zmm11, zmm14
+
+ vpsrldq zmm15, zmm10, 15
+ vpclmulqdq zmm16, zmm15, zpoly, 0
+ vpslldq zmm12, zmm10, 1
+ vpxord zmm12, zmm12, zmm16
+
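+	; At this point the tweak registers cover the first 16 blocks of the
+	; sector (a sketch): the four 128-bit lanes of zmm9 hold T*2^0 .. T*2^3,
+	; zmm10 holds T*2^4 .. T*2^7, zmm11 holds T*2^8 .. T*2^11 and zmm12
+	; holds T*2^12 .. T*2^15, where T is the encrypted initial tweak at [TW].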
+_main_loop_run_16:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ vmovdqu8 zmm3, [ptr_plaintext+16*8]
+ vmovdqu8 zmm4, [ptr_plaintext+16*12]
+ add ptr_plaintext, 256
+
+ encrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ vmovdqu8 [ptr_ciphertext+16*8], zmm3
+ vmovdqu8 [ptr_ciphertext+16*12], zmm4
+ add ptr_ciphertext, 256
+ sub N_val, 256
+
+ cmp N_val, 256
+ jge _main_loop_run_16
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ vextracti32x4 xmm0, zmm4, 0x3 ; keep the last encrypted block
+ jmp _do_n_blocks
+
+_start_by8:
+ ; Make first 7 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+_main_loop_run_8:
+ ; load plaintext
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 128
+
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0
+
+ ; store ciphertext
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ add ptr_ciphertext, 128
+ sub N_val, 128
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ vextracti32x4 xmm0, zmm2, 0x3 ; keep the last encrypted block
+ jmp _do_n_blocks
+
+_steal_cipher_next:
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+ vmovdqa xmm0, [TW]
+
+_steal_cipher:
+ ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm10, [twtempl+N_val]
+ vpshufb xmm8, xmm10
+
+ vmovdqu xmm3, [ptr_plaintext - 16 + N_val]
+ vmovdqu [ptr_ciphertext - 16 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm10, [twtempl]
+ vpxor xmm10, [mask1]
+ vpshufb xmm3, xmm10
+
+ vpblendvb xmm3, xmm3, xmm2, xmm10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm3, xmm0
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+ vaesenc xmm8, [keys + 16*10] ; round 10
+ vaesenc xmm8, [keys + 16*11] ; round 11
+ vaesenc xmm8, [keys + 16*12] ; round 12
+ vaesenc xmm8, [keys + 16*13] ; round 13
+ vaesenclast xmm8, [keys + 16*14] ; round 14
+
+ ; xor Tweak value
+ vpxor xmm8, xmm8, xmm0
+
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext - 16], xmm8
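+
+	; In outline, the stealing path above is standard XTS ciphertext
+	; stealing: the first N_val bytes of the last full ciphertext block are
+	; emitted as the short final ciphertext, its remaining 16-N_val bytes
+	; are appended to the N_val-byte plaintext tail to rebuild a full block,
+	; and that block is encrypted with the next tweak and written back as
+	; the last full ciphertext block.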
+
+_ret_:
+ mov rbx, [_gpr + 8*0]
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+%ifndef ALIGN_STACK
+ add rsp, VARIABLE_OFFSET
+%else
+ mov rsp, rbp
+ pop rbp
+%endif
+ ret
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm1
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; shift constants for the vpshufb instruction
+; different alignments produce the mask values shown below:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
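+; Loading 16 bytes from [vpshufb_shf_table + N_val] gives a byte-shuffle mask
+; that shifts a register left by (16 - N_val) bytes (zero filling); loading
+; from [vpshufb_shf_table + 16 - N_val] and xoring with mask1 gives the
+; matching right-shift mask. The cipher-stealing code above also reuses the
+; high bits of that mask to blend the stolen ciphertext bytes into the tail.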
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
+const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3
+const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5
+const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7
+const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1
+
+shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+
+%else ; The assembler does not understand these opcodes; define an empty symbol for Windows builds.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_XTS_AES_256_enc_expanded_key_vaes
+no_XTS_AES_256_enc_expanded_key_vaes:
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm
new file mode 100644
index 000000000..5b805b74d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm
@@ -0,0 +1,1708 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 256-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; the second key is stored on the stack, aligned to 16 bytes
+; the first key is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_enc_sse(
+; UINT8 *k2, // key used for tweaking, 16*2 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
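+; Overall flow (standard XTS-AES, names illustrative):
+;     T = AES-256-Enc(k2, TW_initial)            ; encrypt_T, stitched with key expansion
+;     for each full 16-byte block P:
+;         C = AES-256-Enc(k1, P xor T) xor T
+;         T = T * x in GF(2^128) mod x^128 + x^7 + x^2 + x + 1
+;     a 1-15 byte tail, if any, is handled by ciphertext stealing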
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; produce the key for the next round
+; raw_key is the output of aeskeygenassist instruction
+; the round_key value before this key_expansion_256 macro is the current round key
+; the round_key value after this key_expansion_256 macro is the next round key
+; 2 macros will be used for key generation in a flip-flopped fashion
+%macro key_expansion_256_flip 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ pshufd %%xraw_key, %%xraw_key, 11111111b
+ shufps %%xtmp, %%xround_key, 00010000b
+ pxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ pxor %%xround_key, %%xtmp
+ pxor %%xround_key, %%xraw_key
+%endmacro
+
+%macro key_expansion_256_flop 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ pshufd %%xraw_key, %%xraw_key, 10101010b
+ shufps %%xtmp, %%xround_key, 00010000b
+ pxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ pxor %%xround_key, %%xtmp
+ pxor %%xround_key, %%xraw_key
+%endmacro
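+
+; In AES-256 key-schedule terms: dword 3 of the aeskeygenassist result holds
+; RotWord(SubWord(w[3])) xor Rcon and dword 2 holds SubWord(w[3]);
+; key_expansion_256_flip broadcasts dword 3 (pshufd 0xff) to derive the
+; even-numbered round keys, key_expansion_256_flop broadcasts dword 2
+; (pshufd 0xaa) for the odd-numbered ones, and the shufps/pxor pairs
+; accumulate the running xor of the previous round key's dwords (FIPS-197).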
+
+
+
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 10
+%define %%xkey2 %1
+%define %%xkey2_2 %2
+%define %%xstate_tweak %3
+%define %%xkey1 %4
+%define %%xkey1_2 %5
+%define %%xraw_key %6
+%define %%xtmp %7
+%define %%ptr_key2 %8
+%define %%ptr_key1 %9
+%define %%ptr_expanded_keys %10
+
+
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ movdqu %%xkey2_2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1_2, [%%ptr_key1 + 16*1]
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1_2
+
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*3], %%xkey1_2
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*4], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*5], %%xkey1_2
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*6], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*7], %%xkey1_2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*8], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*9], %%xkey1_2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*11], %%xkey1_2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*12], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*13], %%xkey1_2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*14], %%xkey1
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
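+
+; encrypt_T stitches three streams together: the k2 round keys are generated
+; and consumed immediately to advance the tweak encryption, the k1 round keys
+; are generated in lockstep and stored to [keys] for the bulk encryption, and
+; after 14 rounds the encrypted tweak is written to [TW].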
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesenclast %%ST1, %%T0
+ aesenclast %%ST2, %%T0
+ aesenclast %%ST3, %%T0
+ aesenclast %%ST4, %%T0
+ aesenclast %%ST5, %%T0
+ aesenclast %%ST6, %%T0
+ aesenclast %%ST7, %%T0
+ aesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
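+
+; The scalar shl/adc/cmovc/xor steps interleaved between the aesenc rounds
+; above compute the next 8 tweak values and park them at [TW]; doing the
+; GF(2^128) doublings in the shadow of the AES rounds hides their latency,
+; and the new tweaks are reloaded into the tweak registers only after the
+; final round.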
+
+
+section .text
+
+mk_global XTS_AES_256_enc_sse, function
+XTS_AES_256_enc_sse:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
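+ ; peel off (whole blocks mod 8) blocks first so the remaining length is a
+ ; multiple of 128 bytes for _main_loop; the final 8 blocks go through
+ ; _last_eight, which skips generating tweak values that would never be used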
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; state register is xmm8: instead of moving xmm3 into xmm8, the destination of the pxor is swapped
+
+
+ ;encrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesenc xmm8, [keys + 16*1] ; round 1
+ aesenc xmm8, [keys + 16*2] ; round 2
+ aesenc xmm8, [keys + 16*3] ; round 3
+ aesenc xmm8, [keys + 16*4] ; round 4
+ aesenc xmm8, [keys + 16*5] ; round 5
+ aesenc xmm8, [keys + 16*6] ; round 6
+ aesenc xmm8, [keys + 16*7] ; round 7
+ aesenc xmm8, [keys + 16*8] ; round 8
+ aesenc xmm8, [keys + 16*9] ; round 9
+ aesenc xmm8, [keys + 16*10] ; round 10
+ aesenc xmm8, [keys + 16*11] ; round 11
+ aesenc xmm8, [keys + 16*12] ; round 12
+ aesenc xmm8, [keys + 16*13] ; round 13
+ aesenclast xmm8, [keys + 16*14] ; round 14
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+pshufb_shf_table:
+; shift constants for the pshufb instruction
+; different alignments produce the mask values shown below:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_vaes.asm
new file mode 100644
index 000000000..f75497ece
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_vaes.asm
@@ -0,0 +1,1687 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 256-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; the second key is stored on the stack, aligned to 16 bytes
+; the first key is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_enc_vaes(
+; UINT8 *k2, // key used for tweaking, 16*2 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+%define zpoly zmm25
+
+; produce the key for the next round
+; raw_key is the output of vaeskeygenassist instruction
+; the round_key value before this key_expansion_256 macro is the current round key
+; the round_key value after this key_expansion_256 macro is the next round key
+; 2 macros will be used for key generation in a flip-flopped fashion
+%macro key_expansion_256_flip 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+%macro key_expansion_256_flop 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 10101010b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+
+
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 10
+%define %%xkey2 %1
+%define %%xkey2_2 %2
+%define %%xstate_tweak %3
+%define %%xkey1 %4
+%define %%xkey1_2 %5
+%define %%xraw_key %6
+%define %%xtmp %7
+%define %%ptr_key2 %8
+%define %%ptr_key1 %9
+%define %%ptr_expanded_keys %10
+
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ vmovdqu %%xkey2_2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1_2, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1_2
+
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1_2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1_2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1_2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1_2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1_2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1_2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
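+
+; Net effect of encrypt_T, in outline (helper names illustrative only):
+;   keys[0..14] <- AES-256 key schedule of k1, stored on the stack
+;   TW[0]       <- AES-256-Encrypt(k2, TW_initial), the tweak for block 0
+; Both key schedules are generated in lock-step so the tweak encryption can
+; proceed round by round while the round keys are still being produced.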
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+%endmacro
+
+
+; encrypt the initial blocks
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; the next 8 tweak values are generated (skipped when %%lt128 is set)
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
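+
+; When %%lt128 is 0, the scalar doubling that produces the next 8 tweak
+; values is interleaved between the AES rounds above so the integer work
+; overlaps the vaesenc latency; when %%lt128 is 1 (final, sub-128-byte pass)
+; those steps are skipped.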
+
+
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight_zmm 6
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%TW1 %3 ; tweak 1
+%define %%TW2 %4 ; tweak 2
+%define %%T0 %5 ; Temp register
+%define %%last_eight %6
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW1, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW1, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW2, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW2, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 11
+ vbroadcasti32x4 %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 12
+ vbroadcasti32x4 %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 13
+ vbroadcasti32x4 %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 14
+ vbroadcasti32x4 %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+%endmacro
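+
+; The in-loop tweak update above multiplies each 128-bit tweak lane by 2^8:
+; vpslldq shifts the lane left by one byte, and the byte that falls off the
+; top (isolated by the vpsrldq ..., 15) is carry-less multiplied by 0x87 and
+; xored back in. A rough per-lane C sketch (illustrative only, not part of
+; this file's build):
+;
+;   static void xts_mul_x8(uint64_t t[2])   /* t[0] = low, t[1] = high */
+;   {
+;       uint64_t top = t[1] >> 56;          /* byte shifted out of bit 127 */
+;       t[1] = (t[1] << 8) | (t[0] >> 56);
+;       t[0] <<= 8;
+;       for (int i = 0; i < 8; i++)         /* clmul(top, 0x87), folded in */
+;           if (top & (1ULL << i))
+;               t[0] ^= (uint64_t)0x87 << i;
+;   }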
+
+
+; Encrypt 16 blocks in parallel
+; generate next 16 tweak values
+%macro encrypt_by_16_zmm 10
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+
+%define %%TW1 %5 ; tweak 1
+%define %%TW2 %6 ; tweak 2
+%define %%TW3 %7 ; tweak 3
+%define %%TW4 %8 ; tweak 4
+
+%define %%T0 %9 ; Temp register
+%define %%last_eight %10
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+ vpxorq %%ST3, %%T0
+ vpxorq %%ST4, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW3, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW3, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW4, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW4, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm15, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm17, zmm15, 1
+ vpxord zmm17, zmm17, zmm14
+%endif
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm16, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm18, zmm16, 1
+ vpxord zmm18, zmm18, zmm14
+%endif
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 11
+ vbroadcasti32x4 %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 12
+ vbroadcasti32x4 %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 13
+ vbroadcasti32x4 %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 14
+ vbroadcasti32x4 %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+ vmovdqa32 %%TW3, zmm17
+ vmovdqa32 %%TW4, zmm18
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_256_enc_vaes, function
+XTS_AES_256_enc_vaes:
+ endbranch
+
+%define ALIGN_STACK
+%ifdef ALIGN_STACK
+ push rbp
+ mov rbp, rsp
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+%else
+ sub rsp, VARIABLE_OFFSET
+%endif
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+ cmp N_val, 128
+ jl _less_than_128_bytes
+
+ vpbroadcastq zpoly, ghash_poly_8b
+
+ cmp N_val, 256
+ jge _start_by16
+
+ cmp N_val, 128
+ jge _start_by8
+
+_do_n_blocks:
+ cmp N_val, 0
+ je _ret_
+
+ cmp N_val, (7*16)
+ jge _remaining_num_blocks_is_7
+
+ cmp N_val, (6*16)
+ jge _remaining_num_blocks_is_6
+
+ cmp N_val, (5*16)
+ jge _remaining_num_blocks_is_5
+
+ cmp N_val, (4*16)
+ jge _remaining_num_blocks_is_4
+
+ cmp N_val, (3*16)
+ jge _remaining_num_blocks_is_3
+
+ cmp N_val, (2*16)
+ jge _remaining_num_blocks_is_2
+
+ cmp N_val, (1*16)
+ jge _remaining_num_blocks_is_1
+
+;; _remaining_num_blocks_is_0:
+ vmovdqa xmm8, xmm0
+ vmovdqa xmm0, xmm9
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_7:
+ mov tmp1, -1
+ shr tmp1, 16
+ kmovq k1, tmp1
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*7
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ add ptr_ciphertext, 16*7
+
+ vextracti32x4 xmm8, zmm2, 0x2
+ vextracti32x4 xmm0, zmm10, 0x3
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_6:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 ymm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*6
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ add ptr_ciphertext, 16*6
+
+ vextracti32x4 xmm8, zmm2, 0x1
+ vextracti32x4 xmm0, zmm10, 0x2
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_5:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*5
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ add ptr_ciphertext, 16*5
+
+ movdqa xmm8, xmm2
+ vextracti32x4 xmm0, zmm10, 0x1
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_4:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ add ptr_plaintext, 16*4
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ add ptr_ciphertext, 16*4
+
+ vextracti32x4 xmm8, zmm1, 0x3
+ vextracti32x4 xmm0, zmm10, 0x0
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_3:
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 2
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ vmovdqa xmm8, xmm3
+ vextracti32x4 xmm0, zmm9, 3
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_2:
+ vextracti32x4 xmm10, zmm9, 1
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*2
+
+ vmovdqa xmm8, xmm2
+ vextracti32x4 xmm0, zmm9, 2
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_1:
+ vmovdqu xmm1, [ptr_plaintext]
+ add ptr_plaintext, 16
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ vmovdqa xmm8, xmm1
+ vextracti32x4 xmm0, zmm9, 1
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+
+_start_by16:
+	; Make the first 8 tweak values, T*2^0 .. T*2^7
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+	; Make the next 8 tweak values by multiplying all of them by 2^8
+ vpsrldq zmm13, zmm9, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm11, zmm9, 1
+ vpxord zmm11, zmm11, zmm14
+
+ vpsrldq zmm15, zmm10, 15
+ vpclmulqdq zmm16, zmm15, zpoly, 0
+ vpslldq zmm12, zmm10, 1
+ vpxord zmm12, zmm12, zmm16
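+
+	; At this point zmm9:zmm10 hold the tweaks T*2^0 .. T*2^7 and
+	; zmm11:zmm12 hold T*2^8 .. T*2^15 (reduced mod x^128+x^7+x^2+x+1).
+	; Roughly: vpsllvq shifts each qword left by the per-lane count, the
+	; shufb_15_7 copy supplies the bits that cross the 64-bit boundary via
+	; vpsrlvq, the k2 (0xaa) mask xors those carries into the high qwords
+	; only, and the bits pushed out of bit 127 are folded back with one
+	; vpclmulqdq by 0x87 per group, e.g. lane i of zmm9 is
+	; (T << i) ^ clmul(T >> (128-i), 0x87).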
+
+_main_loop_run_16:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ vmovdqu8 zmm3, [ptr_plaintext+16*8]
+ vmovdqu8 zmm4, [ptr_plaintext+16*12]
+ add ptr_plaintext, 256
+
+ encrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ vmovdqu8 [ptr_ciphertext+16*8], zmm3
+ vmovdqu8 [ptr_ciphertext+16*12], zmm4
+ add ptr_ciphertext, 256
+ sub N_val, 256
+
+ cmp N_val, 256
+ jge _main_loop_run_16
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+	vextracti32x4	xmm0, zmm4, 0x3		; keep last encrypted block
+ jmp _do_n_blocks
+
+_start_by8:
+	; Make the first 8 tweak values, T*2^0 .. T*2^7
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+_main_loop_run_8:
+ ; load plaintext
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 128
+
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0
+
+ ; store ciphertext
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ add ptr_ciphertext, 128
+ sub N_val, 128
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+	vextracti32x4	xmm0, zmm2, 0x3		; keep last encrypted block
+ jmp _do_n_blocks
+
+_steal_cipher_next:
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+ vmovdqa xmm0, [TW]
+
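+; Cipher-stealing summary, in outline (n = N_val, 1..15):
+;   final n ciphertext bytes       = first n bytes of the last full ciphertext block CC
+;   block rewritten at CC's offset = AES-256-Enc(K1, (P_tail || CC[n..15]) ^ T) ^ T
+; where P_tail is the remaining n plaintext bytes and T is the next tweak value.
+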
+_steal_cipher:
+	; simplified cipher stealing: xmm8 = last cipher block, xmm0 = next tweak value
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm10, [twtempl+N_val]
+ vpshufb xmm8, xmm10
+
+ vmovdqu xmm3, [ptr_plaintext - 16 + N_val]
+ vmovdqu [ptr_ciphertext - 16 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm10, [twtempl]
+ vpxor xmm10, [mask1]
+ vpshufb xmm3, xmm10
+
+ vpblendvb xmm3, xmm3, xmm2, xmm10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm3, xmm0
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+ vaesenc xmm8, [keys + 16*10] ; round 10
+ vaesenc xmm8, [keys + 16*11] ; round 11
+ vaesenc xmm8, [keys + 16*12] ; round 12
+ vaesenc xmm8, [keys + 16*13] ; round 13
+ vaesenclast xmm8, [keys + 16*14] ; round 14
+
+ ; xor Tweak value
+ vpxor xmm8, xmm8, xmm0
+
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext - 16], xmm8
+
+_ret_:
+ mov rbx, [_gpr + 8*0]
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+%ifndef ALIGN_STACK
+ add rsp, VARIABLE_OFFSET
+%else
+ mov rsp, rbp
+ pop rbp
+%endif
+ ret
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
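+	; fall through: 7 whole blocks remain.  tmp1 = N_val & 0x70 is the
+	; number of whole 16-byte blocks (1..7, since N_val < 128 here); the
+	; remaining N_val mod 16 bytes are handled afterwards by cipher stealing.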
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm1
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; shift constants for the vpshufb instruction
+; different alignments result in the values shown below:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
+const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3
+const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5
+const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7
+const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1
+
+shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+
+%else ; Assembler doesn't understand these opcodes. Add an empty symbol for Windows.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_XTS_AES_256_enc_vaes
+no_XTS_AES_256_enc_vaes:
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_128.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_128.S
new file mode 100644
index 000000000..7214f0f25
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_128.S
@@ -0,0 +1,215 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "gcm_common_128.S"
+/*
+ void gist_aes_gcm_enc_finalize_##mode( \
+ const struct gcm_key_data *key_data, \
+ struct gcm_context_data *context, \
+ uint8_t *auth_tag, \
+ uint64_t auth_tag_len \
+ )
+*/
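+/*
+ * Outline: fold the buffered partial block (if any) and the length block
+ * (AAD and message lengths, in bits) into the running GHASH while encrypting
+ * the saved ORIG_IV (J0) with the AES-128 round keys (Key0 .. Key10), then
+ * compute Tag = AES-Enc(K, J0) ^ GHASH and store its first 16, 12 or 8 bytes
+ * according to auth_tag_len.
+ */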
+ declare_var_generic_reg key_data ,0
+ declare_var_generic_reg context ,1
+ declare_var_generic_reg auth_tag ,2
+ declare_var_generic_reg auth_tag_len ,3
+ declare_var_generic_reg partial_block_len ,4
+ declare_var_generic_reg partial_block ,1
+
+ declare_var_generic_reg hashkey_addr ,0
+ declare_var_generic_reg temp0, 6
+
+ declare_var_vector_reg OrigIV ,0
+ declare_var_vector_reg AadHash ,1
+ declare_var_vector_reg HashKey0 ,2
+ declare_var_vector_reg HashKey0Ext ,3
+ declare_var_vector_reg High ,4
+ declare_var_vector_reg Low ,5
+ declare_var_vector_reg Middle0 ,6
+ declare_var_vector_reg Len ,7
+ declare_var_vector_reg Tmp0 ,8
+ declare_var_vector_reg Tmp1 ,9
+ declare_var_vector_reg Zero ,10
+ declare_var_vector_reg Poly ,11
+ declare_var_vector_reg PartitialBlock ,13
+
+ declare_var_vector_reg Tmp2 ,31
+ declare_var_vector_reg Tmp3 ,12
+
+ .set stack_size,48
+ .macro push_stack
+ stp d8, d9,[sp,-stack_size]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ .endm
+
+ .macro pop_stack
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d8, d9, [sp], stack_size
+ .endm
+START_FUNC(enc,KEY_LEN,_finalize_)
+START_FUNC(dec,KEY_LEN,_finalize_)
+ ldr partial_block_len,[context,PARTIAL_BLOCK_LENGTH_OFF]
+ load_aes_keys key_data
+ push_stack
+ /* Init Consts for ghash */
+ movi vZero.4s,0
+ mov temp0,0x87
+ dup vPoly.2d,temp0
+ ldr qOrigIV,[context,ORIG_IV_OFF] /* OrigIV */
+	ldp qAadHash,qLen,[context],PARTIAL_BLOCK_ENC_KEY_OFF /* load AadHash and Len; context advances to the partial block */
+ /* Init Consts for ghash */
+ movi vZero.4s,0
+ mov temp0,0x87
+ dup vPoly.2d,temp0
+ /* complete part */
+ cbnz partial_block_len,10f
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-2)*32]
+ aes_encrypt_round OrigIV,Key0
+ pmull2 vHigh.1q,vAadHash.2d,vHashKey0.2d
+ aes_encrypt_round OrigIV,Key1
+ pmull vLow.1q ,vAadHash.1d,vHashKey0.1d
+ shl vLen.2d,vLen.2d,3 /* Len */
+ aes_encrypt_round OrigIV,Key2
+ pmull vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d
+ rev64 vLen.16b,vLen.16b /* Len */
+ aes_encrypt_round OrigIV,Key3
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d
+ rbit vAadHash.16b,vLen.16b /* Len */
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-1)*32]
+ aes_encrypt_round OrigIV,Key4
+ eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b
+ aes_encrypt_round OrigIV,Key5
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0.2d
+ aes_encrypt_round OrigIV,Key6
+ pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d
+ aes_encrypt_round OrigIV,Key7
+ eor vHigh.16b,vHigh.16b,vTmp0.16b
+ eor vLow.16b ,vLow.16b ,vTmp1.16b
+ pmull2 vTmp2.1q ,vAadHash.2d,vHashKey0Ext.2d
+ aes_encrypt_round OrigIV,Key8
+ pmull vTmp3.1q ,vAadHash.1d,vHashKey0Ext.1d
+ aese vOrigIV.16b,vKey9.16b
+ eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b
+ eor vOrigIV.16b,vOrigIV.16b,vKey10.16b
+ rbit vAadHash.16b,vOrigIV.16b
+ eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b
+ ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly
+
+ rbit vAadHash.16b,vAadHash.16b /* Aad */
+ /* output auth_tag */
+ cmp auth_tag_len,16
+ bne 1f
+ /* most likely auth_tag_len=16 */
+ str qAadHash,[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=12 */
+ cmp auth_tag_len,12
+ bne 1f
+ str dAadHash,[auth_tag],8
+ st1 {vAadHash.s}[2],[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=8 */
+ str dAadHash,[auth_tag]
+ pop_stack
+ ret
+
+10: /* cbnz partial_block_len,10f */
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-3)*32]
+ aes_encrypt_round OrigIV,Key0
+ read_small_data_start PartitialBlock,partial_block,partial_block_len,temp0,Tmp0
+ pmull2 vHigh.1q,vAadHash.2d,vHashKey0.2d
+ aes_encrypt_round OrigIV,Key1
+ pmull vLow.1q ,vAadHash.1d,vHashKey0.1d
+ aes_encrypt_round OrigIV,Key2
+ pmull vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d
+ aes_encrypt_round OrigIV,Key3
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d
+ aes_encrypt_round OrigIV,Key4
+ rbit vAadHash.16b,vPartitialBlock.16b
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-2)*32]
+ aes_encrypt_round OrigIV,Key5
+ eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b
+ pmull2 vTmp0.1q,vAadHash.2d,vHashKey0.2d
+ aes_encrypt_round OrigIV,Key6
+ shl vLen.2d,vLen.2d,3 /* Len */
+ pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d
+ eor vHigh.16b,vHigh.16b,vTmp0.16b
+ aes_encrypt_round OrigIV,Key7
+ eor vLow.16b,vLow.16b,vTmp1.16b
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d
+ rev64 vLen.16b,vLen.16b /* Len */
+ aes_encrypt_round OrigIV,Key8
+ eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b
+ aese vOrigIV.16b,vKey9.16b
+ pmull vTmp0.1q,vAadHash.1d,vHashKey0Ext.1d
+ rbit vAadHash.16b,vLen.16b /* Len */
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-1)*32]
+ eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b
+ eor vOrigIV.16b,vOrigIV.16b,vKey10.16b
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0.2d
+ pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d
+ eor vHigh.16b,vHigh.16b,vTmp0.16b
+ eor vLow.16b ,vLow.16b ,vTmp1.16b
+ pmull2 vTmp2.1q ,vAadHash.2d,vHashKey0Ext.2d
+ pmull vTmp3.1q ,vAadHash.1d,vHashKey0Ext.1d
+ eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b
+ eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b
+ rbit vAadHash.16b,vOrigIV.16b
+ ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly
+
+ rbit vAadHash.16b,vAadHash.16b /* Aad */
+ /* output auth_tag */
+ cmp auth_tag_len,16
+ bne 1f
+ /* most likely auth_tag_len=16 */
+ str qAadHash,[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=12 */
+ cmp auth_tag_len,12
+ bne 1f
+ str dAadHash,[auth_tag],8
+ st1 {vAadHash.s}[2],[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=8 */
+ str dAadHash,[auth_tag]
+ pop_stack
+ ret
+
+END_FUNC(enc,KEY_LEN,_finalize_)
+END_FUNC(dec,KEY_LEN,_finalize_)
+
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_256.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_256.S
new file mode 100644
index 000000000..9eda7178e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_256.S
@@ -0,0 +1,220 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "gcm_common_256.S"
+/*
+ void gist_aes_gcm_enc_finalize_##mode( \
+ const struct gcm_key_data *key_data, \
+ struct gcm_context_data *context, \
+ uint8_t *auth_tag, \
+ uint64_t auth_tag_len \
+ )
+*/
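+/*
+ * Same flow as the 128-bit finalize: fold the partial block (if any) and the
+ * bit-length block into GHASH while encrypting ORIG_IV (J0), here with the
+ * AES-256 key schedule (Key0 .. Key13 plus the final xor with Key14), then
+ * Tag = AES-Enc(K, J0) ^ GHASH, truncated to auth_tag_len (16, 12 or 8).
+ */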
+ declare_var_generic_reg key_data ,0
+ declare_var_generic_reg context ,1
+ declare_var_generic_reg auth_tag ,2
+ declare_var_generic_reg auth_tag_len ,3
+ declare_var_generic_reg partial_block_len ,4
+ declare_var_generic_reg partial_block ,1
+
+ declare_var_generic_reg hashkey_addr ,0
+ declare_var_generic_reg temp0 ,6
+
+ declare_var_vector_reg OrigIV ,0
+ declare_var_vector_reg AadHash ,1
+ declare_var_vector_reg HashKey0 ,2
+ declare_var_vector_reg HashKey0Ext ,3
+ declare_var_vector_reg High ,4
+ declare_var_vector_reg Low ,5
+ declare_var_vector_reg Middle0 ,6
+ declare_var_vector_reg Len ,7
+ declare_var_vector_reg Tmp0 ,8
+ declare_var_vector_reg Tmp1 ,9
+ declare_var_vector_reg Zero ,10
+ declare_var_vector_reg Poly ,11
+ declare_var_vector_reg PartitialBlock ,13
+
+ declare_var_vector_reg Tmp2 ,31
+ declare_var_vector_reg Tmp3 ,12
+
+ .set stack_size,48
+ .macro push_stack
+ stp d8, d9,[sp,-stack_size]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ .endm
+ .macro pop_stack
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d8, d9, [sp], stack_size
+ .endm
+
+START_FUNC(enc,KEY_LEN,_finalize_)
+START_FUNC(dec,KEY_LEN,_finalize_)
+ ldr partial_block_len,[context,PARTIAL_BLOCK_LENGTH_OFF]
+ load_aes_keys key_data
+ push_stack
+
+ ldr qOrigIV,[context,ORIG_IV_OFF] /* OrigIV */
+	ldp qAadHash,qLen,[context],PARTIAL_BLOCK_ENC_KEY_OFF /* load AadHash and Len; context advances to the partial block */
+ /* Init Consts for ghash */
+ movi vZero.4s,0
+ mov temp0,0x87
+ dup vPoly.2d,temp0
+ /* complete part */
+ cbnz partial_block_len,10f
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-2)*32]
+ aes_encrypt_round OrigIV,Key0
+ pmull2 vHigh.1q,vAadHash.2d,vHashKey0.2d
+ aes_encrypt_round OrigIV,Key1
+ pmull vLow.1q ,vAadHash.1d,vHashKey0.1d
+ shl vLen.2d,vLen.2d,3 /* Len */
+ aes_encrypt_round OrigIV,Key2
+ pmull vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d
+ rev64 vLen.16b,vLen.16b /* Len */
+ aes_encrypt_round OrigIV,Key3
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d
+ rbit vAadHash.16b,vLen.16b /* Len */
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-1)*32]
+ aes_encrypt_round OrigIV,Key4
+ eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b
+ aes_encrypt_round OrigIV,Key5
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0.2d
+ aes_encrypt_round OrigIV,Key6
+ pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d
+ aes_encrypt_round OrigIV,Key7
+ eor vHigh.16b,vHigh.16b,vTmp0.16b
+ eor vLow.16b ,vLow.16b ,vTmp1.16b
+ pmull2 vTmp2.1q ,vAadHash.2d,vHashKey0Ext.2d
+ aes_encrypt_round OrigIV,Key8
+ pmull vTmp3.1q ,vAadHash.1d,vHashKey0Ext.1d
+ aes_encrypt_round OrigIV,Key9
+ aes_encrypt_round OrigIV,Key10
+ aes_encrypt_round OrigIV,Key11
+ aes_encrypt_round OrigIV,Key12
+ aese vOrigIV.16b,vKey13.16b
+ eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b
+ eor vOrigIV.16b,vOrigIV.16b,vKey14.16b
+ rbit vAadHash.16b,vOrigIV.16b
+ eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b
+ ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly
+
+ rbit vAadHash.16b,vAadHash.16b /* Aad */
+ /* output auth_tag */
+ cmp auth_tag_len,16
+ bne 1f
+ /* most likely auth_tag_len=16 */
+ str qAadHash,[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=12 */
+ cmp auth_tag_len,12
+ bne 1f
+ str dAadHash,[auth_tag],8
+ st1 {vAadHash.s}[2],[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=8 */
+ str dAadHash,[auth_tag]
+ pop_stack
+ ret
+
+10: /* cbnz partial_block_len,10f */
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-3)*32]
+ aes_encrypt_round OrigIV,Key0
+ read_small_data_start PartitialBlock,partial_block,partial_block_len,temp0,Tmp0
+ pmull2 vHigh.1q,vAadHash.2d,vHashKey0.2d
+ aes_encrypt_round OrigIV,Key1
+ pmull vLow.1q ,vAadHash.1d,vHashKey0.1d
+ aes_encrypt_round OrigIV,Key2
+ pmull vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d
+ aes_encrypt_round OrigIV,Key3
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d
+ aes_encrypt_round OrigIV,Key4
+ rbit vAadHash.16b,vPartitialBlock.16b
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-2)*32]
+ aes_encrypt_round OrigIV,Key5
+ eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b
+ pmull2 vTmp0.1q,vAadHash.2d,vHashKey0.2d
+ aes_encrypt_round OrigIV,Key6
+ shl vLen.2d,vLen.2d,3 /* Len */
+ pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d
+ eor vHigh.16b,vHigh.16b,vTmp0.16b
+ aes_encrypt_round OrigIV,Key7
+ eor vLow.16b,vLow.16b,vTmp1.16b
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d
+ rev64 vLen.16b,vLen.16b /* Len */
+ aes_encrypt_round OrigIV,Key8
+ eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b
+ pmull vTmp0.1q,vAadHash.1d,vHashKey0Ext.1d
+ aes_encrypt_round OrigIV,Key9
+ rbit vAadHash.16b,vLen.16b /* Len */
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-1)*32]
+ aes_encrypt_round OrigIV,Key10
+ eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b
+ aes_encrypt_round OrigIV,Key11
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0.2d
+ aes_encrypt_round OrigIV,Key12
+ pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d
+ aese vOrigIV.16b,vKey13.16b
+ eor vHigh.16b,vHigh.16b,vTmp0.16b
+ eor vLow.16b ,vLow.16b ,vTmp1.16b
+ pmull2 vTmp2.1q ,vAadHash.2d,vHashKey0Ext.2d
+ pmull vTmp3.1q ,vAadHash.1d,vHashKey0Ext.1d
+ eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b
+ eor vOrigIV.16b,vOrigIV.16b,vKey14.16b
+ eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b
+ rbit vAadHash.16b,vOrigIV.16b
+ ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly
+
+ rbit vAadHash.16b,vAadHash.16b /* Aad */
+ /* output auth_tag */
+ cmp auth_tag_len,16
+ bne 1f
+ /* most likely auth_tag_len=16 */
+ str qAadHash,[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=12 */
+ cmp auth_tag_len,12
+ bne 1f
+ str dAadHash,[auth_tag],8
+ st1 {vAadHash.s}[2],[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=8 */
+ str dAadHash,[auth_tag]
+ pop_stack
+ ret
+
+END_FUNC(enc,KEY_LEN,_finalize_)
+END_FUNC(dec,KEY_LEN,_finalize_)
+
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_init.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_init.S
new file mode 100644
index 000000000..0dd94c6b7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_init.S
@@ -0,0 +1,161 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "gcm_common.S"
+/*
+void gist_aes_gcm_init_##mode(
+ const struct gcm_key_data *key_data,
+ struct gcm_context_data *context,
+ uint8_t *iv,
+ uint8_t const *aad,
+ uint64_t aad_len
+ );
+*/
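For orientation, the prototype above corresponds to the aes_gcm_init_128/192/256 entry points selected by the aarch64 dispatcher later in this series. Below is a minimal C sketch of the multi-call GCM-128 encrypt flow this init step belongs to; the header name, the 12-byte IV convention and the pre/update/finalize signatures are taken from the ISA-L public API as assumptions, not from this patch:

    #include <stdint.h>
    #include "aes_gcm.h"            /* ISA-L crypto GCM interface (assumed header name) */

    /* Sketch only: precompute -> init -> update -> finalize for GCM-128. */
    static void gcm128_encrypt_sketch(const uint8_t key[16], uint8_t *iv /* 12 bytes */,
                                      const uint8_t *aad, uint64_t aad_len,
                                      const uint8_t *pt, uint64_t pt_len,
                                      uint8_t *ct, uint8_t tag[16])
    {
            struct gcm_key_data key_data;
            struct gcm_context_data ctx;

            aes_gcm_pre_128(key, &key_data);                     /* expand key, precompute hash keys */
            aes_gcm_init_128(&key_data, &ctx, iv, aad, aad_len); /* the init step implemented in this file */
            aes_gcm_enc_128_update(&key_data, &ctx, ct, pt, pt_len);
            aes_gcm_enc_128_finalize(&key_data, &ctx, tag, 16);  /* 16-, 12- or 8-byte tags, per the finalize code */
    }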
+ key_data .req x0
+ context .req x1
+ iv .req x2
+ aad .req x3
+ aad_len .req x4
+ temp0 .req x7
+ wtemp0 .req w7
+ temp1 .req x6
+ left_len .req x5
+ aad_left .req x2
+ small_tbl_adr .req x6
+
+ hashkey_base .req x0
+ hashkey_addr .req x2
+
+ declare_var_vector_reg AadHash,0
+ declare_var_vector_reg Dat0,1
+ declare_var_vector_reg HashKey0,2
+ declare_var_vector_reg HashKey0Ext,3
+ declare_var_vector_reg High,4
+ declare_var_vector_reg Middle0,5
+ declare_var_vector_reg Low,6
+ declare_var_vector_reg LeftDat,7
+ declare_var_vector_reg Zero,16
+ declare_var_vector_reg Poly,17
+
+ declare_var_vector_reg Tmp0,18
+ declare_var_vector_reg Tmp1,19
+ declare_var_vector_reg Ctr,1
+
+
+START_FUNC(init,128,_)
+START_FUNC(init,192,_)
+START_FUNC(init,256,_)
+ stp aad_len,xzr,[context,AAD_LEN_OFF] //save in_length and aad_length
+ str xzr,[context,PARTIAL_BLOCK_LENGTH_OFF] //clear partial_block_length
+ add hashkey_base,key_data,HASHKEY_BASE_OFF
+ /* Init Consts for ghash */
+ movi vZero.4s,0
+ mov temp0,0x87
+ dup vPoly.2d,temp0
+ /* Set orig_IV */
+ ldr wtemp0,[iv,8]
+ ldr temp1,[iv]
+ movk temp0,0x100,lsl 48
+ stp temp1,temp0,[context,ORIG_IV_OFF]
+ and left_len,aad_len,15
+ ldp qHashKey0,qHashKey0Ext,[key_data,(HASHKEY_TOTAL_NUM-1)*32]
+ /* Set current_counter, save as cpu order */
+ ldr qCtr,[context,ORIG_IV_OFF]
+ rev32 vCtr.16b,vCtr.16b
+ str qCtr,[context,CTR_OFF]
+ cbz aad_len,init_zero_exit
+ lsr aad_len,aad_len,4
+ /* Read small data */
+ cbz left_len,2f
+ add aad_left,aad,aad_len,lsl 4
+ read_small_data_start LeftDat,aad_left,left_len,small_tbl_adr,Tmp0
+ cbz aad_len,24f // aad_len less than 16
+2:
+ cbnz left_len,1f
+	/* left_len == 0 && aad_len != 0 */
+
+ sub aad_len,aad_len,1
+ /* leftDat = aad[-1] */
+ ldr qLeftDat,[aad,aad_len,lsl 4]
+ cbz aad_len,24f /* aad_len == 16 */
+1:
+ /* aad_len > 16 */
+ ldr qAadHash,[aad],16
+ rbit vAadHash.16b,vAadHash.16b
+ sub aad_len,aad_len,1
+1:
+ /* loop ghash_block */
+ cmp aad_len,HASHKEY_TOTAL_NUM - 1
+ bls 1f /* break loop */
+ sub aad_len,aad_len,HASHKEY_TOTAL_NUM
+ ghash_block_n HASHKEY_TOTAL_NUM,AadHash,Dat0,aad,hashkey_addr,hashkey_base, \
+ HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \
+ Tmp0,Tmp1
+ b 1b /* back to loop start */
+1:
+ cbz aad_len,23f /* left aad_len == 0 */
+ mov temp0,HASHKEY_TOTAL_NUM - 1
+ sub temp0,temp0,aad_len
+ add hashkey_addr,hashkey_base,temp0,lsl 5
+ sub aad_len,aad_len,1
+
+
+ ghash_mult_init_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Middle0,Tmp0,Dat0,2 /* load next hash */
+1:
+ cbz aad_len,1f
+ ghash_mult_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Middle0,Tmp0,Tmp1,Dat0, 2
+
+ sub aad_len,aad_len,1
+ b 1b
+1:
+ ghash_mult_round_noload AadHash,HashKey0,HashKey0Ext,High,Low,Middle0,Tmp0,Tmp1
+ rbit vAadHash.16b, vLeftDat.16b
+ ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly
+ str qAadHash,[context]
+ ret
+
+23:
+ ghash_block_reg AadHash,LeftDat, \
+ HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \
+ Tmp0
+ str qAadHash,[context]
+ ret
+24:	/* aad_len less than or equal to 16 bytes */
+ rbit vLeftDat.16b, vLeftDat.16b
+ str qLeftDat,[context]
+ ret
+init_zero_exit:
+ stp xzr,xzr,[context]
+ ret
+END_FUNC(init,128,_)
+END_FUNC(init,192,_)
+END_FUNC(init,256,_)
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_consts.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_consts.S
new file mode 100644
index 000000000..c4e8ef59c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_consts.S
@@ -0,0 +1,140 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a
+ .section .rodata
+#define CONST_VAR_START(a) \
+ .align 3;.global a;.type a, %object;a
+
+#define CONST_VAR_END(a) \
+ .size a,. - a
+CONST_VAR_START(shift_small_data_table):
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+CONST_VAR_START(read_small_data_table):
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+CONST_VAR_END(shift_small_data_table)
+ .byte 0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x0c,0x0d,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x08,0x09,0x0a,0x0b,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x08,0x09,0x0a,0x0b,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d,0x0e,0xff,0xff,0xff,0xff,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0xff,0xff,0xff,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0e,0xff,0xff,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0xff,0xff
+CONST_VAR_START(write_small_data_table):
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0xff
+CONST_VAR_END(read_small_data_table)
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0xff,0xff,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0xff,0xff,0x04,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xff,0xff,0xff,0xff,0x08,0x09,0xff,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0xff,0xff,0xff,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0xff,0xff,0x0c,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0xff,0xff
+CONST_VAR_START(read_end_small_data_table):
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0xff
+CONST_VAR_END(write_small_data_table)
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0e
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0c,0x0d
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0c,0x0d,0x0e
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b,0x0e
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b,0x0c,0x0d
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0e
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d
+ .byte 0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d,0x0e
+ .byte 0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b
+ .byte 0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0e
+ .byte 0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d
+CONST_VAR_START(write_end_small_data_table):
+ .byte 0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e
+CONST_VAR_END(read_end_small_data_table)
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0f,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0e,0x0f,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0d,0x0e,0x0f,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0b,0x0c,0x0d,0x0e,0xff,0xff,0x0f,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff
+ .byte 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0x0f,0xff
+ .byte 0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff,0x0e,0x0f,0xff,0xff
+ .byte 0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0xff,0xff,0xff,0xff,0x0d,0x0e,0x0f,0xff
+ .byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
+ .byte 0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0xff,0xff,0x0f,0xff
+ .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff
+CONST_VAR_START(tbx_end_small_data_table):
+ .byte 0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff
+CONST_VAR_END(write_end_small_data_table)
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+CONST_VAR_START(tbx_start_small_data_table):
+ .byte 0xff,0xff,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+CONST_VAR_END(tbx_end_small_data_table)
+ .byte 0xff,0xff,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0f
+CONST_VAR_END(tbx_start_small_data_table)
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_128.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_128.S
new file mode 100644
index 000000000..9f1ff80fb
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_128.S
@@ -0,0 +1,30 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "gcm_common_128.S"
+#include "gcm_enc_dec.S"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_256.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_256.S
new file mode 100644
index 000000000..f3cc2b802
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_256.S
@@ -0,0 +1,30 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "gcm_common_256.S"
+#include "gcm_enc_dec.S"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_128.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_128.S
new file mode 100644
index 000000000..e635d7e70
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_128.S
@@ -0,0 +1,30 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "gcm_common_128.S"
+#include "gcm_precomp.S" \ No newline at end of file
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_256.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_256.S
new file mode 100644
index 000000000..52b76a6a2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_256.S
@@ -0,0 +1,30 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "gcm_common_256.S"
+#include "gcm_precomp.S" \ No newline at end of file
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_128.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_128.S
new file mode 100644
index 000000000..42c48d9a0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_128.S
@@ -0,0 +1,32 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "gcm_common_128.S"
+#include "gcm_update.S"
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_256.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_256.S
new file mode 100644
index 000000000..1c2c33b48
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_256.S
@@ -0,0 +1,32 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "gcm_common_256.S"
+#include "gcm_update.S"
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_aarch64_dispatcher.c
new file mode 100644
index 000000000..1a2077356
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_aarch64_dispatcher.c
@@ -0,0 +1,108 @@
+/**********************************************************************
+ Copyright(c) 2020-2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+#undef PROVIDER_BASIC
+#define PROVIDER_BASIC(a) (void*)0
+
+static unsigned long is_crypto_available(void)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ return (auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES);
+}
+
+#define DEFINE_CBC_INTERFACE_DISPATCHER(func,mode,suffix) \
+ DEFINE_INTERFACE_DISPATCHER(aes_cbc_##func##_##mode) \
+ { \
+ if (is_crypto_available()) \
+ return PROVIDER_INFO(aes_cbc_##func##_##mode##_##suffix); \
+ return PROVIDER_BASIC(aes_cbc_##func##_##mode); \
+ }
+
+DEFINE_CBC_INTERFACE_DISPATCHER(enc, 128, aes);
+DEFINE_CBC_INTERFACE_DISPATCHER(enc, 192, aes);
+DEFINE_CBC_INTERFACE_DISPATCHER(enc, 256, aes);
+
+/*
+ * AES-CBC decryption can be parallelised: each ciphertext block is decrypted
+ * independently, and the result is then XORed (EOR) with the previous
+ * ciphertext block (or with the IV for the first block) to produce the
+ * plaintext, so several blocks can be kept in flight at once. A short C
+ * sketch of this structure follows the dispatchers below.
+ *
+ * The unroll factor depends on the micro-architecture. The factors for N1,
+ * A57 and A72 are based on the optimization guides and test results; other
+ * platforms use the factor derived from ThunderX2 test results.
+ */
+DEFINE_INTERFACE_DISPATCHER(aes_cbc_dec_128)
+{
+ if (is_crypto_available()) {
+ switch (get_micro_arch_id()) {
+ case MICRO_ARCH_ID(ARM, NEOVERSE_N1):
+ return PROVIDER_INFO(aes_cbc_dec_128_aes_1);
+ case MICRO_ARCH_ID(ARM, CORTEX_A57):
+ return PROVIDER_INFO(aes_cbc_dec_128_aes_4);
+ case MICRO_ARCH_ID(ARM, CORTEX_A72):
+ return PROVIDER_INFO(aes_cbc_dec_128_aes_6);
+ }
+ return PROVIDER_INFO(aes_cbc_dec_128_aes_5);
+ }
+ return PROVIDER_BASIC(aes_cbc_dec_128);
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_cbc_dec_192)
+{
+ if (is_crypto_available()) {
+ switch (get_micro_arch_id()) {
+ case MICRO_ARCH_ID(ARM, NEOVERSE_N1):
+ return PROVIDER_INFO(aes_cbc_dec_192_aes_1);
+ case MICRO_ARCH_ID(ARM, CORTEX_A57):
+ return PROVIDER_INFO(aes_cbc_dec_192_aes_5);
+ case MICRO_ARCH_ID(ARM, CORTEX_A72):
+ return PROVIDER_INFO(aes_cbc_dec_192_aes_4);
+ }
+ return PROVIDER_INFO(aes_cbc_dec_192_aes_5);
+ }
+ return PROVIDER_BASIC(aes_cbc_dec_192);
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_cbc_dec_256)
+{
+ if (is_crypto_available()) {
+ switch (get_micro_arch_id()) {
+ case MICRO_ARCH_ID(ARM, NEOVERSE_N1):
+ return PROVIDER_INFO(aes_cbc_dec_256_aes_1);
+ case MICRO_ARCH_ID(ARM, CORTEX_A57):
+ return PROVIDER_INFO(aes_cbc_dec_256_aes_5);
+ case MICRO_ARCH_ID(ARM, CORTEX_A72):
+ return PROVIDER_INFO(aes_cbc_dec_256_aes_6);
+ }
+ return PROVIDER_INFO(aes_cbc_dec_256_aes_5);
+ }
+ return PROVIDER_BASIC(aes_cbc_dec_256);
+}
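As referenced in the comment above the dispatchers, the parallelisable shape of CBC decryption can be illustrated with a minimal C sketch. This is not the ISA-L implementation; aes_decrypt_block() is a hypothetical single-block primitive used only to show why the decrypt step parallelises ahead of the chaining XOR:

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical single-block AES decryption primitive (placeholder, not ISA-L). */
    void aes_decrypt_block(const uint8_t *expanded_keys, const uint8_t *in, uint8_t *out);

    static void cbc_dec_sketch(const uint8_t *in, const uint8_t iv[16],
                               const uint8_t *expanded_keys, uint8_t *out,
                               uint64_t len_bytes)
    {
            uint8_t prev[16];
            memcpy(prev, iv, 16);
            for (uint64_t off = 0; off < len_bytes; off += 16) {
                    uint8_t tmp[16];
                    aes_decrypt_block(expanded_keys, in + off, tmp);  /* independent per block */
                    for (int i = 0; i < 16; i++)
                            out[off + i] = tmp[i] ^ prev[i];          /* chain: previous ciphertext or IV */
                    memcpy(prev, in + off, 16);                       /* next block chains off this ciphertext */
            }
    }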
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_common.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_common.S
new file mode 100644
index 000000000..6f793843a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_common.S
@@ -0,0 +1,54 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#define FN_NAME(fn,mode,post) aes_cbc_##fn##_##mode##_##post
+#define LABEL_NAME(fn,mode,post) .L##fn##_##mode##_##post
+#define START_FUNC(fn,mode,post) .global FN_NAME(fn,mode,post); \
+ .type FN_NAME(fn,mode,post), %function; \
+ FN_NAME(fn,mode,post):
+#define END_FUNC(fn,mode,post) .size FN_NAME(fn,mode,post), .-FN_NAME(fn,mode,post)
+.macro declare_var_vector_reg name:req,reg:req
+.ifdef q\name
+ .unreq q\name
+ .unreq v\name
+ .unreq s\name
+ .unreq d\name
+.endif
+ .set q\name , \reg
+ q\name .req q\reg
+ v\name .req v\reg
+ s\name .req s\reg
+ d\name .req d\reg
+.endm
+
+.macro declare_var_generic_reg name:req,reg:req
+ \name .req x\reg
+ x\name .req x\reg
+ w\name .req w\reg
+.endm \ No newline at end of file
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_dec_aes.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_dec_aes.S
new file mode 100644
index 000000000..11bd90a71
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_dec_aes.S
@@ -0,0 +1,482 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+#include "cbc_common.S"
+ .altmacro
+.macro _aes_decrypt_round block:req,key:req
+ aesd v\block\().16b,vKey\key\().16b
+ .if \key < 13
+ aesimc v\block\().16b,v\block\().16b
+ .endif
+ .if \key > 13
+	.error "error here"
+ .endif
+.endm
+
+.macro aes_decrypt_round block,reg,key
+ _aes_decrypt_round In\reg\()_\block,\key
+.endm
+
+.macro load_keys first_key
+ .if \first_key == 4
+ ld1 {vKey4.4s -vKey6.4s},[keys],3*16
+ .endif
+ .ifc 2 , \first_key
+ ldr qKey2,[keys],1*16
+ ld1 {vKey3.16b -vKey6.16b},[keys],4*16
+ .endif
+ .ifc 0 , \first_key
+ ld1 {vKey0.16b -vKey2.16b},[keys],3*16
+ ld1 {vKey3.16b -vKey6.16b},[keys],4*16
+ .endif
+ ld1 {vKey7.16b -vKey10.16b},[keys],4*16
+ ld1 {vKey11.16b-vKey14.16b},[keys],4*16
+.endm
+
+.macro aes_decrypt_blocks_round blocks,key_idx,key_reg,next_keyreg,first_idx
+ .if \key_idx == 12
+ ldr q\next_keyreg,[keys],(\first_idx-13)*16
+ .else
+ ldr q\next_keyreg,[keys],16
+ .endif
+ n=0
+ .rept \blocks
+ _aes_decrypt_round %n,\key_reg
+ n=n+1
+ .endr
+.endm
+
+.macro aes_decrypt_rounds blocks,key_st,key_end,first_idx
+ j=key_st
+ .rept \key_end - \key_st + 1
+ aes_decrypt_blocks_round \blocks,%j,%(j%2),%((j+1)%2),\first_idx
+ j=j+1
+ .endr
+.endm
+
+.macro aes_cbc_decrypt_rounds blocks,first_idx,reg,next_reg
+ aes_decrypt_rounds \blocks,\first_idx,12,\first_idx
+.endm
+
+.macro declare_prefix idx,reg,prefix
+ declare_var_vector_reg \prefix\()\idx,\reg
+.endm
+
+.macro mldr reg,block,addr
+ ldr qIn\reg\()_\block,[\addr],16
+.endm
+
+.macro mldrin reg,blocks,addr
+ .if \blocks == 1
+ ldr qIn\reg\()_0,[\addr],16
+ .exitm
+ .endif
+ .if \blocks == 2
+ ldp qIn\reg\()_0,qIn\reg\()_1,[\addr],2*16
+ .exitm
+ .endif
+ .if \blocks == 3
+ ldr qIn\reg\()_0,[\addr],16
+ ldp qIn\reg\()_1,qIn\reg\()_2,[\addr],2*16
+ .exitm
+ .endif
+ .if \blocks == 4
+ ld1 {vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16
+ .exitm
+ .endif
+ .if \blocks == 5
+ ldr qIn\reg\()_0,[\addr],16
+ ld1 {vIn\reg\()_1.16b-vIn\reg\()_4.16b},[\addr],4*16
+ .exitm
+ .endif
+ .if \blocks == 6
+ ldp qIn\reg\()_0,qIn\reg\()_1,[\addr],2*16
+ ld1 {vIn\reg\()_2.16b-vIn\reg\()_5.16b},[\addr],4*16
+ .exitm
+ .endif
+ .if \blocks == 7
+ ld1 {vIn\reg\()_0.16b-vIn\reg\()_2.16b},[\addr],3*16
+ ld1 {vIn\reg\()_3.16b-vIn\reg\()_6.16b},[\addr],4*16
+ .exitm
+ .endif
+
+ .if \blocks == 8
+ ld1 {vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16
+ ld1 {vIn\reg\()_4.16b-vIn\reg\()_7.16b},[\addr],4*16
+ .exitm
+ .endif
+ .if \blocks == 9
+ ld1 {vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16
+ ld1 {vIn\reg\()_4.16b-vIn\reg\()_7.16b},[\addr],4*16
+ ldr qIn\reg\()_8,[\addr],16
+ .exitm
+ .endif
+.endm
+
+.macro mstrout reg,blocks,addr
+ .if \blocks == 1
+ str qIn\reg\()_0,[\addr],16
+ .exitm
+ .endif
+ .if \blocks == 2
+ stp qIn\reg\()_0,qIn\reg\()_1,[\addr],2*16
+ .exitm
+ .endif
+ .if \blocks == 3
+ str qIn\reg\()_0,[\addr],16
+ stp qIn\reg\()_1,qIn\reg\()_2,[\addr],2*16
+ .exitm
+ .endif
+ .if \blocks == 4
+ st1 {vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16
+ .exitm
+ .endif
+ .if \blocks == 5
+ str qIn\reg\()_0,[\addr],16
+ st1 {vIn\reg\()_1.16b-vIn\reg\()_4.16b},[\addr],4*16
+ .exitm
+ .endif
+ .if \blocks == 6
+ stp qIn\reg\()_0,qIn\reg\()_1,[\addr],2*16
+ st1 {vIn\reg\()_2.16b-vIn\reg\()_5.16b},[\addr],4*16
+ .exitm
+ .endif
+ .if \blocks == 7
+ st1 {vIn\reg\()_0.16b-vIn\reg\()_2.16b},[\addr],3*16
+ st1 {vIn\reg\()_3.16b-vIn\reg\()_6.16b},[\addr],4*16
+ .exitm
+ .endif
+
+ .if \blocks == 8
+ st1 {vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16
+ st1 {vIn\reg\()_4.16b-vIn\reg\()_7.16b},[\addr],4*16
+ .exitm
+ .endif
+ .if \blocks == 9
+ st1 {vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16
+ st1 {vIn\reg\()_4.16b-vIn\reg\()_7.16b},[\addr],4*16
+ str qIn\reg\()_8,[\addr],16
+ .exitm
+ .endif
+.endm
+
+.macro eorkey14 block,reg
+ eor vBlock\block\().16b,vKey14.16b,vState\reg\()_\block\().16b
+.endm
+
+.macro eorblock block,reg
+ eor vIn\reg\()_\block\().16b,vBlock\block\().16b,vIn\reg\()_\block\().16b
+.endm
+
+.macro movstate0 block,reg
+ mov vState\reg\()_0.16b,vIn\reg\()_\block\().16b
+.endm
+
+.macro cbc_decrypt_rounds blocks,reg,first_key,cur_blocks
+ .ifb \cur_blocks
+ _blocks=\blocks
+ .else
+ _blocks=\cur_blocks
+ .endif
+ key=\first_key + 1
+ .if 3*\blocks+1 >= 32-15+\first_key
+ ldr_key %key,\first_key
+ .endif
+ n=0
+ .rept _blocks - 1
+ eorkey14 %((n+1)%_blocks),\reg
+ aes_decrypt_round %n,\reg,\first_key
+ n=n+1
+ .endr
+ eorkey14 0,\reg
+ movstate0 %(_blocks-1),\reg
+ aes_decrypt_round %n,\reg,\first_key
+
+ k=0
+ .rept 15-\first_key-3
+ n=0
+ .if 3*\blocks+1 >= 32-15+\first_key
+ ldr_key %(key+k+1),\first_key
+ .endif
+
+ .rept _blocks
+ aes_decrypt_round %n,\reg,%(key+k)
+ n=n+1
+ .endr
+ k=k+1
+ .endr
+ n=0
+ .if 3*\blocks+1 >= 32-15+\first_key
+ ldr_key \first_key,\first_key
+ .endif
+ .rept _blocks
+ aes_decrypt_round %n,\reg,13
+ eorblock %n,\reg
+ n=n+1
+ .endr
+.endm
+
+.macro print_macro a,b,c,d,e
+ .print "print_macro,\a \b \c \d \e"
+.endm
+
+.macro remainder_process blocks,first_key,curblk
+.if \blocks > (1<<\curblk)
+ tbz xlen_remainder,\curblk,1f
+ mldrin 0,%(1<<\curblk),in
+ cbc_decrypt_rounds \blocks,0,\first_key,%(1<<\curblk)
+ mstrout 0,%(1<<\curblk),out
+1:
+.endif
+.endm
+
+.macro aes_cbc_decrypt_blocks first_key,blocks
+ division \blocks, len_bytes,len_remainder,tmp0,tmp1
+ mov xlen_quotient_in,xlen_quotient
+ /*
+ input regs(2*\block) + tmp regs(\blocks) + State reg(1)
+ + key regs(15-\first_key) < 32
+ */
+ .if 3*\blocks+1 < 32-15+\first_key
+ n=\first_key
+ .rept 15-\first_key
+ declare_prefix %n,%(n+17),Key
+ n=n+1
+ .endr
+ load_keys \first_key
+ .else
+ n=\first_key
+ .rept 14-\first_key
+ declare_prefix %n,%((n%2)+29),Key
+ n=n+1
+ .endr
+ declare_prefix 14,31,Key
+ /* load first key */
+ ldr_key \first_key,\first_key
+ /* load last key */
+ ldr_key 14,\first_key
+ .endif
+ m=\blocks
+ l=\blocks-1
+ declare_prefix 0,0,State0_
+ declare_prefix 0,0,State1_
+ n=0
+ .rept \blocks
+ declare_prefix %n,%(n+1),In0_
+ declare_prefix %n,%(n+m+1),In1_
+ declare_prefix %n,%(n+2*m+1),Block
+ n=n+1
+ .endr
+ n=1
+ .rept \blocks -1
+ declare_prefix %n,%(n),State0_
+ declare_prefix %n,%(n+m),State1_
+ n=n+1
+ .endr
+ ldr qState0_0,[IV]
+ cbz xlen_quotient,9f
+ mldrin 0,\blocks,in
+ sub xlen_quotient_in,xlen_quotient_in,1
+ b 5f
+
+3:
+ sub xlen_quotient,xlen_quotient,1
+ mstrout 1,\blocks,out
+ cbz xlen_quotient,9f
+5:
+ cbz xlen_quotient_in,1f
+ mldrin 1,\blocks,in
+ sub xlen_quotient_in,xlen_quotient_in,1
+1:
+ cbc_decrypt_rounds \blocks,0,\first_key
+ sub xlen_quotient,xlen_quotient,1
+ mstrout 0,\blocks,out
+ cbz xlen_quotient,9f
+
+ cbz xlen_quotient_in,1f
+ mldrin 0,\blocks,in
+ sub xlen_quotient_in,xlen_quotient_in,1
+1:
+ cbc_decrypt_rounds \blocks,1,\first_key
+ b 3b
+9:
+ remainder_process \blocks,\first_key,3
+ remainder_process \blocks,\first_key,2
+ remainder_process \blocks,\first_key,1
+ remainder_process \blocks,\first_key,0
+.endm
+
+
+.macro division blocks,quotient,remainder,tmp0,tmp1
+ .if \blocks == 1
+ mov x\remainder, 0
+ .exitm
+ .endif
+ .if \blocks == 2
+ and x\remainder, x\quotient, 1
+ lsr x\quotient, x\quotient, 1
+ .exitm
+ .endif
+ .if \blocks == 3
+ mov x\tmp0, -6148914691236517206
+ mov x\remainder, x\quotient
+ movk x\tmp0, 0xaaab, lsl 0
+ umulh x\tmp0, x\quotient, x\tmp0
+ and x\tmp1, x\tmp0, -2
+ lsr x\quotient, x\tmp0, 1
+ add x\tmp1, x\tmp1, x\quotient
+ sub x\remainder, x\remainder, x\tmp1
+ .exitm
+ .endif
+ .if \blocks == 4
+ and x\remainder, x\quotient, 3
+ lsr x\quotient, x\quotient, 2
+ .exitm
+ .endif
+ .if \blocks == 5
+ mov x\tmp0, -3689348814741910324
+ mov x\remainder, x\quotient
+ movk x\tmp0, 0xcccd, lsl 0
+ umulh x\tmp0, x\quotient, x\tmp0
+ and x\tmp1, x\tmp0, -4
+ lsr x\quotient, x\tmp0, 2
+ add x\tmp1, x\tmp1, x\quotient
+ sub x\remainder, x\remainder, x\tmp1
+ .exitm
+ .endif
+ .if \blocks == 6
+ mov x\tmp0, -6148914691236517206
+ mov x\tmp1, x\quotient
+ movk x\tmp0, 0xaaab, lsl 0
+ umulh x\tmp0, x\quotient, x\tmp0
+ lsr x\quotient, x\tmp0, 2
+ add x\remainder, x\quotient, x\quotient, lsl 1
+ sub x\remainder, x\tmp1, x\remainder, lsl 1
+ .exitm
+ .endif
+ .if \blocks == 7
+ mov x\tmp0, 9363
+ mov x\tmp1, x\quotient
+ movk x\tmp0, 0x9249, lsl 16
+ movk x\tmp0, 0x4924, lsl 32
+ movk x\tmp0, 0x2492, lsl 48
+ umulh x\quotient, x\quotient, x\tmp0
+ sub x\tmp0, x\tmp1, x\quotient
+ add x\tmp0, x\quotient, x\tmp0, lsr 1
+ lsr x\quotient, x\tmp0, 2
+ lsl x\remainder, x\quotient, 3
+ sub x\remainder, x\remainder, x\quotient
+ sub x\remainder, x\tmp1, x\remainder
+ .exitm
+ .endif
+ .if \blocks == 8
+ and x\remainder, x\quotient, 7
+ lsr x\quotient, x\quotient, 3
+ .exitm
+ .endif
+ .if \blocks == 9
+ mov x\tmp0, 58255
+ mov x\remainder, x\quotient
+ movk x\tmp0, 0x8e38, lsl 16
+ movk x\tmp0, 0x38e3, lsl 32
+ movk x\tmp0, 0xe38e, lsl 48
+ umulh x\tmp0, x\quotient, x\tmp0
+ and x\tmp1, x\tmp0, -8
+ lsr x\quotient, x\tmp0, 3
+ add x\tmp1, x\tmp1, x\quotient
+ sub x\remainder, x\remainder, x\tmp1
+ .exitm
+ .endif
+.endm
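The division macro above computes the quotient and remainder of the block count without a divide instruction: it multiplies by a precomputed 64-bit reciprocal (the mov/movk constants), takes the high half with umulh, and shifts. A hedged C sketch of the same trick for the blocks == 3 case, relying on the GCC/Clang unsigned __int128 extension:

    #include <stdint.h>

    /* n / 3 and n % 3 via reciprocal multiplication:
     * magic = ceil(2^65 / 3) = 0xAAAAAAAAAAAAAAAB, quotient = umulh(n, magic) >> 1,
     * mirroring the mov/movk/umulh/lsr sequence the macro emits for blocks == 3. */
    static void div3_sketch(uint64_t n, uint64_t *q, uint64_t *r)
    {
            const uint64_t magic = 0xAAAAAAAAAAAAAAABULL;
            uint64_t hi = (uint64_t)(((unsigned __int128)n * magic) >> 64);  /* umulh */
            *q = hi >> 1;
            *r = n - *q * 3;
    }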
+
+.macro ldr_key num,first_key
+ ldr qKey\num,[keys,16*(\num - \first_key)]
+.endm
+#ifndef CBC_DECRYPT_BLOCKS_NUM
+#define CBC_DECRYPT_BLOCKS_NUM 8
+#endif
+
+.macro cbc_decrypt first_key:req,blocks
+ lsr xlen_bytes,xlen_bytes,4
+ cbz xlen_bytes,10f
+ push_stack
+ aes_cbc_decrypt_blocks \first_key,\blocks
+ pop_stack
+10:
+.endm
+
+.set stack_size,64
+.macro push_stack
+ stp d8, d9,[sp,-stack_size]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+.endm
+
+.macro pop_stack
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], stack_size
+.endm
+
+/*
+void aes_cbc_dec_128(
+ void *in, //!< Input cipher text
+	uint8_t *IV,	//!< Must be 16 bytes, aligned to a 16 byte boundary
+	uint8_t *keys,	//!< Expanded decryption keys; must be on a 16 byte boundary, with length of key size * key rounds (or dec_keys of cbc_key_data)
+ void *out, //!< Output plain text
+ uint64_t len_bytes //!< Must be a multiple of 16 bytes
+ );
+*/
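A minimal usage sketch of the entry point documented above, grounded only in that signature; how keys is populated (the expanded decryption key schedule, e.g. dec_keys of cbc_key_data) is outside this file and left as an assumption:

    #include <stdint.h>

    /* Prototype as documented above. */
    void aes_cbc_dec_128(void *in, uint8_t *IV, uint8_t *keys, void *out, uint64_t len_bytes);

    static void cbc_dec_call_sketch(uint8_t *cipher, uint64_t len_bytes, /* multiple of 16 */
                                    uint8_t iv[16], uint8_t *dec_keys,   /* 16-byte aligned expanded keys */
                                    uint8_t *plain)
    {
            aes_cbc_dec_128(cipher, iv, dec_keys, plain, len_bytes);
    }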
+ declare_var_generic_reg in ,0
+ declare_var_generic_reg IV ,1
+ declare_var_generic_reg keys ,2
+ declare_var_generic_reg out ,3
+ declare_var_generic_reg len_bytes ,4
+ declare_var_generic_reg len_quotient,4
+ declare_var_generic_reg len_remainder,5
+ declare_var_generic_reg tmp0 ,6
+ declare_var_generic_reg tmp1 ,7
+ declare_var_generic_reg len_quotient_in,6
+
+.macro define_aes_cbc_dec_func mode:req,blocks:req
+ .global aes_cbc_dec_\mode\()_aes_\blocks
+aes_cbc_dec_\mode\()_aes_\blocks:
+ cbc_decrypt %((256-mode)/32),\blocks
+ ret
+ .size aes_cbc_dec_\mode\()_aes_\blocks, . - aes_cbc_dec_\mode\()_aes_\blocks
+.endm
+
+.irp blocks,1,2,3,4,5,6,7,8,9
+ define_aes_cbc_dec_func 128,\blocks
+ define_aes_cbc_dec_func 192,\blocks
+ define_aes_cbc_dec_func 256,\blocks
+.endr
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_enc_aes.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_enc_aes.S
new file mode 100644
index 000000000..8eb5e507d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_enc_aes.S
@@ -0,0 +1,157 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+
+#include "cbc_common.S"
+
+ declare_var_vector_reg Key0 ,17
+ declare_var_vector_reg Key1 ,18
+ declare_var_vector_reg Key2 ,19
+ declare_var_vector_reg Key3 ,20
+ declare_var_vector_reg Key4 ,21
+ declare_var_vector_reg Key5 ,22
+ declare_var_vector_reg Key6 ,23
+ declare_var_vector_reg Key7 ,24
+ declare_var_vector_reg Key8 ,25
+ declare_var_vector_reg Key9 ,26
+ declare_var_vector_reg Key10 ,27
+ declare_var_vector_reg Key11 ,28
+ declare_var_vector_reg Key12 ,29
+ declare_var_vector_reg Key13 ,30
+ declare_var_vector_reg Key14 ,31
+
+.macro aes_encrypt_round block,key
+ aese v\block\().16b,vKey\key\().16b
+ .if \key < 13
+ aesmc v\block\().16b,v\block\().16b
+ .endif
+.endm
+
+.macro aes_encrypt_round_name block,key
+ aese v\block\().16b,v\key\().16b
+ aesmc v\block\().16b,v\block\().16b
+.endm
+
+
+
+.set stack_size,64
+.macro push_stack
+ stp d8, d9,[sp,-stack_size]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+.endm
+
+.macro pop_stack
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], stack_size
+.endm
+/*
+void aes_cbc_enc_128(
+	void    *in,	//!< Input plain text
+	uint8_t *IV,	//!< Must be 16 bytes, aligned to a 16 byte boundary
+	uint8_t *keys,	//!< Expanded encryption keys; must be on a 16 byte boundary, with length of key size * key rounds (or enc_keys of cbc_key_data)
+	void    *out,	//!< Output cipher text
+ uint64_t len_bytes //!< Must be a multiple of 16 bytes
+ );
+*/
+ declare_var_generic_reg in ,0
+ declare_var_generic_reg IV ,1
+ declare_var_generic_reg keys ,2
+ declare_var_generic_reg out ,3
+ declare_var_generic_reg len_bytes ,4
+
+ declare_var_vector_reg State ,0
+ declare_var_vector_reg FirstKey ,1
+ declare_var_vector_reg Block ,2
+ declare_var_vector_reg ConstKey ,3
+.macro load_key num
+ ldr qKey\num,[keys],16
+.endm
+.altmacro
+.macro cbc_encrypt first:req
+ lsr xlen_bytes,xlen_bytes,4
+ cbz xlen_bytes,3f
+ ldr qState,[IV]
+ ldr qKey\first,[keys],16
+ .set lastkey_off,13-\first
+ ldr qKey14,[keys,lastkey_off*16]
+ ldr qBlock,[in],16
+ n=\first
+ second=1+\first
+ .rept 5-n
+ n=n+1
+ load_key %n
+ .endr
+ ld1 {vKey6.4s - vKey9.4s},[keys],4*16
+ eor vBlock.16b,vBlock.16b ,vState.16b
+ eor vConstKey.16b,vKey\first\().16b,vKey14.16b
+ aes_encrypt_round Block,\first
+ ld1 {vKey10.4s - vKey13.4s},[keys]
+ b 1f
+2:
+ aes_encrypt_round Block,\first
+ str qState,[out],16
+1:
+ sub xlen_bytes,xlen_bytes,1
+ aes_encrypt_round Block,%second
+ cbz xlen_bytes,1f
+ ldr qKey\first,[in],16
+1:
+ n=second
+ .rept 12-n
+ n=n+1
+ aes_encrypt_round Block,%n
+ .endr
+
+ eor vKey\first\().16b,vKey\first\().16b,vConstKey.16b
+ aes_encrypt_round Block,13
+ eor vState.16b,vBlock.16b,vKey14.16b
+ cbnz xlen_bytes,2b
+ str qState,[out]
+3:
+
+.endm
+START_FUNC(enc,128,aes)
+ cbc_encrypt 4
+ ret
+END_FUNC(enc,128,aes)
+
+START_FUNC(enc,192,aes)
+ cbc_encrypt 2
+ ret
+END_FUNC(enc,192,aes)
+
+START_FUNC(enc,256,aes)
+ cbc_encrypt 0
+ ret
+END_FUNC(enc,256,aes) \ No newline at end of file
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_multibinary_aarch64.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_multibinary_aarch64.S
new file mode 100644
index 000000000..fba533754
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_multibinary_aarch64.S
@@ -0,0 +1,38 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aarch64_multibinary.h"
+
+mbin_interface aes_cbc_dec_128
+mbin_interface aes_cbc_dec_192
+mbin_interface aes_cbc_dec_256
+
+mbin_interface aes_cbc_enc_128
+mbin_interface aes_cbc_enc_192
+mbin_interface aes_cbc_enc_256
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_aarch64_dispatcher.c
new file mode 100644
index 000000000..f8188e3ae
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_aarch64_dispatcher.c
@@ -0,0 +1,255 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+#undef PROVIDER_BASIC
+#define PROVIDER_BASIC(a) (void*)0
+
+static unsigned long is_crypto_available(void)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ return (auxval & (HWCAP_ASIMD | HWCAP_AES | HWCAP_PMULL)) ==
+ (HWCAP_ASIMD | HWCAP_AES | HWCAP_PMULL);
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_128)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_enc_128_aes);
+
+ return PROVIDER_BASIC(aes_gcm_enc_128);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_128)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_dec_128_aes);
+
+ return PROVIDER_BASIC(aes_gcm_dec_128);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_precomp_128)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_precomp_128_aes);
+
+ return PROVIDER_BASIC(aes_gcm_precomp_128);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_256)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_enc_256_aes);
+
+ return PROVIDER_BASIC(aes_gcm_enc_256);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_256)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_dec_256_aes);
+
+ return PROVIDER_BASIC(aes_gcm_dec_256);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_precomp_256)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_precomp_256_aes);
+
+ return PROVIDER_BASIC(aes_gcm_precomp_256);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_128_update)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_enc_128_update_aes);
+
+ return PROVIDER_BASIC(aes_gcm_enc_128_update);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_128_finalize)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_enc_128_finalize_aes);
+
+ return PROVIDER_BASIC(aes_gcm_enc_128_finalize);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_128_update)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_dec_128_update_aes);
+
+ return PROVIDER_BASIC(aes_gcm_dec_128_update);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_128_finalize)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_dec_128_finalize_aes);
+
+ return PROVIDER_BASIC(aes_gcm_dec_128_finalize);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_256_update)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_enc_256_update_aes);
+
+ return PROVIDER_BASIC(aes_gcm_enc_256_update);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_256_finalize)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_enc_256_finalize_aes);
+
+ return PROVIDER_BASIC(aes_gcm_enc_256_finalize);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_256_update)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_dec_256_update_aes);
+
+ return PROVIDER_BASIC(aes_gcm_dec_256_update);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_256_finalize)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_dec_256_finalize_aes);
+
+ return PROVIDER_BASIC(aes_gcm_dec_256_finalize);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_init_256)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_init_256_aes);
+
+ return PROVIDER_BASIC(aes_gcm_init_256);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_init_128)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_init_128_aes);
+
+ return PROVIDER_BASIC(aes_gcm_init_128);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_128_nt)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_enc_128_nt_aes);
+
+ return PROVIDER_BASIC(aes_gcm_enc_128_nt);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_128_update_nt)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_enc_128_update_nt_aes);
+
+ return PROVIDER_BASIC(aes_gcm_enc_128_update_nt);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_128_nt)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_dec_128_nt_aes);
+
+ return PROVIDER_BASIC(aes_gcm_dec_128_nt);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_128_update_nt)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_dec_128_update_nt_aes);
+
+ return PROVIDER_BASIC(aes_gcm_dec_128_update_nt);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_256_nt)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_enc_256_nt_aes);
+
+ return PROVIDER_BASIC(aes_gcm_enc_256_nt);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_256_update_nt)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_enc_256_update_nt_aes);
+
+ return PROVIDER_BASIC(aes_gcm_enc_256_update_nt);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_256_nt)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_dec_256_nt_aes);
+
+ return PROVIDER_BASIC(aes_gcm_dec_256_nt);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_256_update_nt)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_dec_256_update_nt_aes);
+
+ return PROVIDER_BASIC(aes_gcm_dec_256_update_nt);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common.S
new file mode 100644
index 000000000..042f6cf19
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common.S
@@ -0,0 +1,430 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+#define HASHKEY_TOTAL_NUM (24)
+#define HASHKEY_BASE_OFF (15*16)
+#define HASHKEY_OFF(n) ((15*16)+n*32)
+#define HASHKEY_EXT_OFF(n) ((15*16)+n*32+16)
+#ifndef KEY_LEN
+#define KEY_LEN 128
+#endif
+#ifndef BLOCKS
+#define BLOCKS 24
+#endif
+#define FN_NAME(fn,mode,post) aes_gcm_##fn##_##mode####post##aes
+#define START_FUNC(fn,mode,post) .global FN_NAME(fn,mode,post); \
+ .type FN_NAME(fn,mode,post), %function; \
+ FN_NAME(fn,mode,post):
+#define END_FUNC(fn,mode,post) .size FN_NAME(fn,mode,post), .-FN_NAME(fn,mode,post)
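For example, START_FUNC(enc,128,_) expands (via FN_NAME) to the global symbol aes_gcm_enc_128_aes, which is exactly the name the C dispatcher earlier in this patch returns through PROVIDER_INFO(aes_gcm_enc_128_aes); the _nt_ variants produce the corresponding aes_gcm_*_nt_aes symbols.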
+
+#define AAD_LEN_OFF 16
+#define IN_LENGTH_OFF 24
+#define PARTIAL_BLOCK_ENC_KEY_OFF 32
+#define PARTIAL_BLOCK_LENGTH_OFF 80
+#define CTR_OFF 64
+#define ORIG_IV_OFF 48
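In other words, each hash-key slot is 32 bytes: HASHKEY_OFF(n) = 240 + 32*n holds a bit-reflected power of the hash key and HASHKEY_EXT_OFF(n) = HASHKEY_OFF(n) + 16 holds its 8-byte-rotated copy, so the 24 slots span offsets 240 through 1007 (HASHKEY_BASE_OFF = 15*16 matches the size of a maximal expanded key schedule). gcm_precomp.S, later in this patch, fills the table from the highest slot downward, so the last slot holds H and slot 0 holds H^24.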
+/*
+	[high,middle,low] += aadhash * [hashkey0,hashkey0_ext]
+	aadhash = rbit(*dat_adr); dat_adr += 16
+	if left_count > 1:
+		[hashkey0,hashkey0_ext] = *hashkey_adr; hashkey_adr += 32
+*/
+
+.macro ghash_mult_round aadhash:req,dat_adr:req,hashkey_adr:req, \
+ hashkey0:req,hashkey0_ext:req,high:req,low:req,middle:req, \
+ tmp0:req,tmp1:req,next_dat:req,left_count:req
+
+ ldr q\next_dat,[\dat_adr],16
+ pmull v\tmp0\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d
+ pmull2 v\tmp1\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d
+ .if \left_count > 1
+ ldr q\hashkey0_ext,[\hashkey_adr,16]
+ .endif
+ eor v\middle\().16b,v\middle\().16b,v\tmp0\().16b
+ pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey0\().2d
+ eor v\middle\().16b,v\middle\().16b,v\tmp1\().16b
+ pmull v\tmp1\().1q,v\aadhash\().1d,v\hashkey0\().1d
+ .if \left_count > 1
+ ldr q\hashkey0,[\hashkey_adr],32
+ .endif
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ eor v\low\().16b,v\low\().16b,v\tmp1\().16b
+ rbit v\aadhash\().16b, v\next_dat\().16b
+.endm
+
+.macro ghash_mult_init_round aadhash:req,dat_adr:req,hashkey_adr:req, \
+ hashkey0:req,hashkey0_ext:req, \
+ high:req,low:req,middle:req,tmp0:req,next_dat:req,left_count:req
+ ldp q\hashkey0,q\hashkey0_ext,[\hashkey_adr],32
+ ldr q\next_dat,[\dat_adr],16
+ pmull v\middle\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d
+ pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d
+ .if \left_count > 1
+ ldr q\hashkey0_ext,[\hashkey_adr,16]
+ .endif
+ pmull2 v\high\().1q,v\aadhash\().2d,v\hashkey0\().2d
+ eor v\middle\().16b,v\middle\().16b,v\tmp0\().16b
+
+ pmull v\low\().1q,v\aadhash\().1d,v\hashkey0\().1d
+ .if \left_count > 1
+ ldr q\hashkey0,[\hashkey_adr],32
+ .endif
+ rbit v\aadhash\().16b, v\next_dat\().16b
+.endm
+
+/* aadhash = reduction(high,middle,low) + aadhash */
+.macro ghash_mult_final_round aadhash:req, \
+ high:req,low:req,middle:req,tmp0:req, \
+ zero:req,poly:req
+
+ ext v\tmp0\().16b,v\middle\().16b,v\zero\().16b,8 /*high*/
+ ext v\middle\().16b,v\zero\().16b,v\middle\().16b,8 /*low */
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ eor v\low\().16b,v\low\().16b,v\middle\().16b
+
+ pmull2 v\middle\().1q,v\high\().2d,v\poly\().2d
+
+ ext v\tmp0\().16b,v\middle\().16b,v\zero\().16b,8 /*high*/
+ ext v\middle\().16b,v\zero\().16b,v\middle\().16b,8 /*low*/
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ eor v\low\().16b,v\low\().16b,v\middle\().16b
+ pmull v\middle\().1q,v\high\().1d,v\poly\().1d
+ eor v\tmp0\().16b, v\low\().16b, v\middle\().16b
+ eor v\aadhash\().16b, v\aadhash\().16b, v\tmp0\().16b
+.endm
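For reference, the two pmull/ext folds above implement the usual reflected GHASH reduction; a sketch of the identity they encode, with the callers' 0x87 constant standing for the low terms x^7 + x^2 + x + 1 of the GHASH polynomial:

    $$ g(x) = x^{128} + x^7 + x^2 + x + 1, \qquad
       H(x)\,x^{128} + L(x) \;\equiv\; L(x) \oplus H(x)\,(x^7 + x^2 + x + 1) \pmod{g(x)} $$

The fold is applied twice, 64 bits at a time, because pmull/pmull2 each consume only one 64-bit half of the high part.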
+.macro ghash_reset_hashkey_addr hashkey_addr:req,hashkey_base:req,count:req
+	add	\hashkey_addr,\hashkey_base,(HASHKEY_TOTAL_NUM-\count)<<5
+.endm
+
+
+.macro ghash_block_n count:req,aadhash:req, dat:req,dat_addr:req, hashkey_addr:req, hashkey_base:req, \
+ hashkey:req,hashkey_ext:req,high:req,low:req,middle:req, zero:req,poly:req, \
+ tmp0:req,tmp1:req
+
+ ghash_reset_hashkey_addr \hashkey_addr,\hashkey_base,\count
+ ghash_mult_init_round \aadhash,\dat_addr,\hashkey_addr,\hashkey,\hashkey_ext, \
+ \high,\low,\middle,\tmp0,\dat,\count
+ .set left_count,\count - 1
+ .rept left_count
+ ghash_mult_round \aadhash,\dat_addr,\hashkey_addr,\hashkey,\hashkey_ext, \
+ \high,\low,\middle,\tmp0,\tmp1,\dat, left_count
+ .set left_count,left_count - 1
+
+ .endr
+ ghash_mult_final_round \aadhash,\high,\low,\middle,\tmp0,\zero,\poly
+.endm
+
+/*
+ aadhash=aadhash*[hashkey,hashkey_ext] + rbit(dat)
+*/
+.macro ghash_block_reg aadhash:req, dat:req, \
+ hashkey:req,hashkey_ext:req,high:req,low:req,middle:req, zero:req,poly:req, \
+ tmp0:req
+ pmull v\middle\().1q,v\aadhash\().1d,v\hashkey_ext\().1d
+ pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey_ext\().2d
+ pmull2 v\high\().1q,v\aadhash\().2d,v\hashkey\().2d
+ eor v\middle\().16b,v\middle\().16b,v\tmp0\().16b
+ pmull v\low\().1q,v\aadhash\().1d,v\hashkey\().1d
+ rbit v\aadhash\().16b, v\dat\().16b
+ ghash_mult_final_round \aadhash,\high,\low,\middle,\tmp0,\zero,\poly
+.endm
+
+.macro ghash_mult_round_noload aadhash:req, \
+ hashkey0:req,hashkey0_ext:req,high:req,low:req,middle:req, \
+ tmp0:req,tmp1:req
+
+ pmull v\tmp0\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d
+ pmull2 v\tmp1\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d
+ eor v\middle\().16b,v\middle\().16b,v\tmp0\().16b
+ pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey0\().2d
+ eor v\middle\().16b,v\middle\().16b,v\tmp1\().16b
+ pmull v\tmp1\().1q,v\aadhash\().1d,v\hashkey0\().1d
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ eor v\low\().16b,v\low\().16b,v\tmp1\().16b
+
+.endm
+
+/* aadhash = reduction(high, low + aadhash, poly) */
+.macro poly_mult_final_x2 aadhash:req, \
+ high:req,low:req,tmp0:req,tmp1:req, \
+ poly:req
+ pmull2 v\tmp1\().1q,v\high\().2d,v\poly\().2d
+ eor v\low\().16b, v\aadhash\().16b, v\low\().16b
+ eor v\aadhash\().16b,v\aadhash\().16b,v\aadhash\().16b
+ ext v\tmp0\().16b,v\tmp1\().16b,v\aadhash\().16b,8 //high
+ ext v\tmp1\().16b,v\aadhash\().16b,v\tmp1\().16b,8 //low
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ eor v\low\().16b,v\low\().16b,v\tmp1\().16b
+ pmull v\tmp1\().1q,v\high\().1d,v\poly\().1d
+ eor v\aadhash\().16b, v\low\().16b, v\tmp1\().16b
+.endm
+
+.macro aes_encrypt_round block,key
+ aese v\block\().16b,v\key\().16b
+ aesmc v\block\().16b,v\block\().16b
+.endm
+
+.macro declare_var_vector_reg name:req,reg:req
+ q\name .req q\reg
+ v\name .req v\reg
+ s\name .req s\reg
+ d\name .req d\reg
+.endm
+
+.macro declare_var_generic_reg name:req,reg:req
+ \name .req x\reg
+ x\name .req x\reg
+ w\name .req w\reg
+.endm
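As a concrete example, declare_var_vector_reg Key0,16 makes qKey0, vKey0, sKey0 and dKey0 all .req aliases of SIMD register 16, so ldr qKey0 and aese with vKey0.16b refer to the same physical register; declare_var_generic_reg does the same for the x/w views of a general-purpose register.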
+
+/* Read fewer than 16 bytes of data */
+.macro read_small_data dest:req,src:req,size:req,tbl_adr:req,tbl:req
+ ldr q\tbl,[\tbl_adr,\size,lsl 4]
+ tbz \size,3,1f
+ ld1 {v\dest\().d}[0],[\src],8
+1:
+ tbz \size,2,1f
+ ld1 {v\dest\().s}[2],[\src],4
+1:
+ tbz \size,1,1f
+ ld1 {v\dest\().h}[6],[\src],2
+1:
+ tbz \size,0,1f
+ ld1 {v\dest\().b}[14],[\src],1
+1:
+ tbl v\dest\().16b,{v\dest\().16b},v\tbl\().16b
+.endm
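For instance, with size = 11 (binary 1011) the macro performs an 8-byte, a 2-byte and a 1-byte load into fixed lanes and then shuffles those 11 bytes into their final positions using the size-indexed tbl mask fetched from the table at tbl_adr.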
+.macro read_small_data_start dest:req,src:req,size:req,tbl_adr:req,tbl:req
+ adrp \tbl_adr,:got:read_small_data_table
+ ldr \tbl_adr,[\tbl_adr,#:got_lo12:read_small_data_table]
+ read_small_data \dest,\src,\size,\tbl_adr,\tbl
+.endm
+
+.macro read_small_data_end dest:req,src:req,size:req,tbl_adr:req,tbl:req
+ adrp \tbl_adr,:got:read_end_small_data_table
+ ldr \tbl_adr,[\tbl_adr,#:got_lo12:read_end_small_data_table]
+ read_small_data \dest,\src,\size,\tbl_adr,\tbl
+.endm
+
+.macro write_small_data src:req,dest:req,size:req,tbl_adr:req,tmp1:req
+ ldr q\tmp1,[\tbl_adr,\size,lsl 4]
+ tbl v\tmp1\().16b,{v\src\().16b},v\tmp1\().16b
+ tbz \size,3,1f
+ st1 {v\tmp1\().d}[0],[\dest],8
+1:
+ tbz \size,2,1f
+ st1 {v\tmp1\().s}[2],[\dest],4
+1:
+ tbz \size,1,1f
+ st1 {v\tmp1\().h}[6],[\dest],2
+1:
+ tbz \size,0,1f
+ st1 {v\tmp1\().b}[14],[\dest],1
+1:
+.endm
+.macro write_small_data_start src:req,dest:req,size:req,tbl_adr:req,tmp1:req
+ adrp \tbl_adr,:got:write_small_data_table
+ ldr \tbl_adr,[\tbl_adr,#:got_lo12:write_small_data_table]
+ write_small_data \src,\dest,\size,\tbl_adr,\tmp1
+.endm
+.macro write_small_data_end src:req,dest:req,size:req,tbl_adr:req,tmp1:req
+ adrp \tbl_adr,:got:write_end_small_data_table
+ ldr \tbl_adr,[\tbl_adr,#:got_lo12:write_end_small_data_table]
+ write_small_data \src,\dest,\size,\tbl_adr,\tmp1
+.endm
+
+.macro tbx_small_data_end src:req,dest:req,size:req,tbl_adr:req,tmp1:req
+ adrp \tbl_adr,:got:tbx_end_small_data_table
+ ldr \tbl_adr,[\tbl_adr,#:got_lo12:tbx_end_small_data_table]
+ ldr q\tmp1,[\tbl_adr,\size,lsl 4]
+ tbx v\dest\().16b,{v\src\().16b},v\tmp1\().16b
+.endm
+
+.macro tbx_small_data_start src:req,dest:req,size:req,tbl_adr:req,tmp1:req
+ adrp \tbl_adr,:got:tbx_start_small_data_table
+ ldr \tbl_adr,[\tbl_adr,#:got_lo12:tbx_start_small_data_table]
+ ldr q\tmp1,[\tbl_adr,\size,lsl 4]
+ tbx v\dest\().16b,{v\src\().16b},v\tmp1\().16b
+.endm
+
+
+.macro clear_small_data dest:req,zero:req,size:req,tbl_adr:req,tmp1:req
+ adrp \tbl_adr,:got:shift_small_data_table
+ ldr \tbl_adr,[\tbl_adr,#:got_lo12:shift_small_data_table]
+ add \tbl_adr,\tbl_adr,16
+ sub \tbl_adr,\tbl_adr,\size
+ ldr q\tmp1,[\tbl_adr]
+ tbx v\dest\().16b,{v\zero\().16b},v\tmp1\().16b
+.endm
+
+
+.macro aes_gcm_n_round is_enc:req,count:req,aadhash:req, dat_addr:req, \
+ hashkey_addr:req, hashkey_base:req, \
+ hashkey:req,hashkey_ext:req,high:req,low:req, poly:req, \
+ ctr:req,enc_ctr:req,one:req,out_adr:req, \
+ tmp0:req,tmp1:req
+
+ ghash_reset_hashkey_addr \hashkey_addr,\hashkey_base,\count
+
+ aes_gcm_init \is_enc,\aadhash,\dat_addr,\hashkey_addr, \
+ \hashkey,\hashkey_ext, \high,\low, \
+ \ctr,\enc_ctr,\one,\out_adr, \
+ \tmp0,\tmp1,\count
+
+ .set left_count,\count - 1
+ .rept left_count
+ aes_gcm_middle \is_enc,\aadhash,\dat_addr,\hashkey_addr, \
+ \hashkey,\hashkey_ext, \high,\low, \
+ \ctr,\enc_ctr,\one,\out_adr, \
+ \tmp0,\tmp1, left_count
+ .set left_count,left_count - 1
+ .endr
+
+ poly_mult_final_x2 \aadhash,\high,\low,\tmp0,\tmp1,\poly
+
+.endm
+
+
+/*
+	aadhash=aadhash*[hashkey_base[(HASHKEY_TOTAL_NUM-2),(HASHKEY_TOTAL_NUM-1)]] + rbit(dat)
+*/
+.macro ghash_block_reg_x2 aadhash:req, dat:req, hashkey_base:req, \
+ hashkey:req,high:req,low:req,tmp0:req, tmp1:req, \
+ tmp2:req,temp0:req
+	ldr	q\hashkey,[\hashkey_base,(HASHKEY_TOTAL_NUM-1)*32+16]
+	eor	v\tmp2\().16b,v\tmp2\().16b,v\tmp2\().16b	//zero
+ pmull v\tmp1\().1q,v\aadhash\().1d,v\hashkey\().1d
+ pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey\().2d
+	ldr	q\hashkey,[\hashkey_base,(HASHKEY_TOTAL_NUM-1)*32]
+ eor v\tmp0\().16b,v\tmp1\().16b,v\tmp0\().16b
+ ext v\tmp0\().16b,v\tmp0\().16b,v\tmp2\().16b,8 /*high*/
+ ext v\tmp1\().16b,v\tmp2\().16b,v\tmp0\().16b,8 /*low*/
+ pmull2 v\high\().1q,v\aadhash\().2d,v\hashkey\().2d
+	mov	\temp0,0x87
+	pmull	v\low\().1q,v\aadhash\().1d,v\hashkey\().1d
+	dup	v\tmp2\().2d,\temp0
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ eor v\low\().16b,v\low\().16b,v\tmp1\().16b
+ rbit v\aadhash\().16b, v\dat\().16b
+ poly_mult_final_x2 \aadhash,\high,\low,\tmp0,\tmp1,\tmp2
+.endm
+
+.macro __generic_load_small_data is_enc:req,len_bit:req,small_read_len:req, \
+ in_adr:req,out_adr:req,partial_block:req,temp0:req,temp1:req,r:req,p
+ tbz \small_read_len,\len_bit,1f
+ ldr\p \r\()\temp0,[\in_adr],1<<\len_bit /*in */
+ ldr\p \r\()\temp1,[\partial_block] /* partial*/
+ eor \r\()\temp1,\r\()\temp0,\r\()\temp1
+ .ifc \is_enc ,decrypt
+ str\p \r\()\temp0,[\partial_block],1<<\len_bit
+ .endif
+ .ifc \is_enc, encrypt
+ str\p \r\()\temp1,[\partial_block],1<<\len_bit
+ .endif
+ str\p \r\()\temp1,[\out_adr],1<<\len_bit
+1:
+.endm
+.macro generic_load_partial_block is_enc:req,small_read_len:req,in_adr:req,out_adr:req, \
+ partial_block:req,temp0:req,temp1:req
+ __generic_load_small_data \is_enc,3,\small_read_len,\in_adr,\out_adr,\partial_block,\temp0,\temp1,x /* small_read_len >=8 */
+ __generic_load_small_data \is_enc,2,\small_read_len,\in_adr,\out_adr,\partial_block,\temp0,\temp1,w /* small_read_len >=4 */
+ __generic_load_small_data \is_enc,1,\small_read_len,\in_adr,\out_adr,\partial_block,\temp0,\temp1,w,h /* small_read_len >=2 */
+ __generic_load_small_data \is_enc,0,\small_read_len,\in_adr,\out_adr,\partial_block,\temp0,\temp1,w,b /* small_read_len >=1 */
+.endm
+/* general-purpose-register version (no NEON loads) */
+.macro generic_partial_block_start is_enc:req,in_len:req,in_adr:req,out_adr:req,context:req, \
+ partial_block:req,partial_block_len:req,small_read_len:req,left_partial_block_len:req, \
+ temp0:req
+ mov \left_partial_block_len,16
+ add \partial_block,\context,PARTIAL_BLOCK_ENC_KEY_OFF
+ sub \left_partial_block_len,\left_partial_block_len,\partial_block_len
+ add \partial_block,\partial_block,\partial_block_len
+ cmp \in_len,\left_partial_block_len
+ csel \small_read_len,\in_len,\left_partial_block_len, ls
+ add \partial_block_len,\partial_block_len,\small_read_len
+ sub \in_len,\in_len,\small_read_len
+ and \partial_block_len,\partial_block_len,0xf
+ str \partial_block_len,[\context,PARTIAL_BLOCK_LENGTH_OFF]
+ generic_load_partial_block \is_enc,\small_read_len,\in_adr,\out_adr,\partial_block, \
+ \left_partial_block_len,\temp0 /* small_read_len >=8 */
+.endm
+.macro generic_paritial_block_end is_enc:req,in_len:req,in_adr:req,out_adr:req,context:req, \
+ partial_block:req,temp0:req,temp1:req
+ str \in_len,[\context,PARTIAL_BLOCK_LENGTH_OFF]
+ add \partial_block,\context,PARTIAL_BLOCK_ENC_KEY_OFF
+ generic_load_partial_block \is_enc,\in_len,\in_adr,\out_adr,\partial_block,\temp0,\temp1 /* small_read_len >=8 */
+.endm
+/*partial_block_len+in_len < 16,partial_block_len=0,in_len>0 */
+.macro paritial_block_small_length is_enc:req,context:req,in_len:req,in_adr:req,out_adr:req,temp0:req,temp1:req,Ctr:req
+
+	cbz	\in_len,1f
+ ldr \temp0,[\context,PARTIAL_BLOCK_LENGTH_OFF]
+ add \temp1,\temp0,\in_len
+ str \temp1,[\context,PARTIAL_BLOCK_LENGTH_OFF]
+ add \context,\temp0,PARTIAL_BLOCK_ENC_KEY_OFF
+2:/* loop start */
+ sub \in_len,\in_len,1
+ ldrb w\temp0,[\in_adr],1
+ ldrb w\temp1,[\context]
+ eor w\temp1,w\temp1,w\temp0
+ strb w\temp1,[\out_adr],1
+.ifc \is_enc , encrypt
+ strb w\temp1,[\context],1
+.endif
+.ifc \is_enc,decrypt
+ strb w\temp0,[\context],1
+.endif
+ cbnz \in_len,2b
+1:/* loop end */
+.endm
+
+/* 0<in_len < 16,partial_block_len=0 */
+.macro paritial_block_end is_enc:req,context:req,in_len:req,in_adr:req,out_adr:req, \
+ temp0:req,partial_block_len:req \
+ PartialBlock:req,ctr:req,one:req,Tmp2:req,Tmp3:req,Tmp4:req
+ add v\ctr\().4s,v\ctr\().4s,v\one\().4s //increase ctr
+ str q\ctr,[context,CTR_OFF]
+ read_small_data_start \PartialBlock,\in_adr,\in_len,\tbl_adr,\Tmp0
+ aes_encrypt_block \ctr
+
+.endm
+ declare_var_vector_reg Key0 ,16
+ declare_var_vector_reg Key1 ,17
+ declare_var_vector_reg Key2 ,18
+ declare_var_vector_reg Key3 ,19
+ declare_var_vector_reg Key4 ,20
+ declare_var_vector_reg Key5 ,21
+ declare_var_vector_reg Key6 ,22
+ declare_var_vector_reg Key7 ,23
+ declare_var_vector_reg Key8 ,24
+ declare_var_vector_reg Key9 ,25
+ declare_var_vector_reg Key10,26
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_128.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_128.S
new file mode 100644
index 000000000..02add91a2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_128.S
@@ -0,0 +1,165 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#define KEY_LEN 128
+#include "gcm_common.S"
+
+#define KEY_REGS 0,1,2,3,4,5,6,7,8
+.macro aes_encrypt_block block:req
+ aes_encrypt_round \block,Key0
+ aes_encrypt_round \block,Key1
+ aes_encrypt_round \block,Key2
+ aes_encrypt_round \block,Key3
+ aes_encrypt_round \block,Key4
+ aes_encrypt_round \block,Key5
+ aes_encrypt_round \block,Key6
+ aes_encrypt_round \block,Key7
+ aes_encrypt_round \block,Key8
+ aese v\block\().16b,vKey9.16b
+ eor v\block\().16b,v\block\().16b,vKey10.16b
+.endm
+
+/*
+	Load the AES-128 round keys into vKey0..vKey10
+ */
+.macro load_aes_keys key_addr:req
+ ld1 {vKey0.4s- vKey3.4s},[\key_addr],64
+ ld1 {vKey4.4s- vKey7.4s},[\key_addr],64
+ ldp qKey8,qKey9,[\key_addr],32
+ ldr qKey10,[\key_addr],15*16 - 128 - 32
+.endm
+
+
+
+/*
+	[high,low] += aadhash * [hashkey0,hashkey0_ext]
+	dat = *dat_adr; dat_adr += 16
+	ctr += 1; enc_dat = aes_encrypt(ctr) ^ dat
+	*out_adr = enc_dat; out_adr += 16
+	aadhash = rbit(enc_dat) when encrypting, rbit(dat) when decrypting
+	if left_count > 1: [hashkey0,hashkey0_ext] = *hashkey_adr; hashkey_adr += 32
+*/
+.macro aes_gcm_middle is_enc:req,aadhash:req,dat_adr:req,hashkey_adr:req, \
+ hashkey0:req,hashkey0_ext:req,high:req,low:req, \
+ ctr:req,enc_ctr:req,one:req,out_adr:req, \
+ tmp0:req,tmp1:req,left_count:req
+
+ pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey0\().2d
+ pmull v\tmp1\().1q,v\aadhash\().1d,v\hashkey0\().1d
+ .if \left_count > 1
+ ldr q\hashkey0,[\hashkey_adr],16
+ .endif
+
+ add v\ctr\().4s,v\ctr\().4s,v\one\().4s //increase ctr
+
+ rev32 v\enc_ctr\().16b,v\ctr\().16b
+ aes_encrypt_round \enc_ctr,Key0
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ pmull v\tmp0\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d
+ eor v\low\().16b,v\low\().16b,v\tmp1\().16b
+ pmull2 v\tmp1\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d
+ .if \left_count > 1
+ ldr q\hashkey0_ext,[\hashkey_adr],16
+ .endif
+ eor v\aadhash\().16b,v\aadhash\().16b,v\aadhash\().16b
+ aes_encrypt_round \enc_ctr,Key1
+ aes_encrypt_round \enc_ctr,Key2
+ eor v\tmp0\().16b,v\tmp1\().16b,v\tmp0\().16b
+ aes_encrypt_round \enc_ctr,Key3
+ ext v\tmp1\().16b,v\aadhash\().16b,v\tmp0\().16b,8
+ ext v\tmp0\().16b,v\tmp0\().16b,v\aadhash\().16b,8
+ aes_encrypt_round \enc_ctr,Key4
+ eor v\low\().16b,v\low\().16b,v\tmp1\().16b
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ aes_encrypt_round \enc_ctr,Key5
+ ldr q\aadhash,[\dat_adr],16
+ aes_encrypt_round \enc_ctr,Key6
+ aes_encrypt_round \enc_ctr,Key7
+ aes_encrypt_round \enc_ctr,Key8
+ aese v\enc_ctr\().16b,vKey9.16b
+ eor v\enc_ctr\().16b,v\enc_ctr\().16b,vKey10.16b
+ eor v\enc_ctr\().16b,v\enc_ctr\().16b,v\aadhash\().16b
+ .ifc \is_enc, encrypt
+ rbit v\aadhash\().16b,v\enc_ctr\().16b
+ .endif
+ .ifc \is_enc , decrypt
+ rbit v\aadhash\().16b,v\aadhash\().16b
+ .endif
+ str q\enc_ctr,[\out_adr],16
+.endm
+
+.macro aes_gcm_init is_enc:req,aadhash:req,dat_adr:req,hashkey_adr:req, \
+ hashkey0:req,hashkey0_ext:req, high:req,low:req, \
+ ctr:req,enc_ctr:req,one:req,out_adr:req, \
+ tmp0:req,tmp1:req,left_count:req
+ ldr q\hashkey0,[\hashkey_adr],16
+ add v\ctr\().4s,v\ctr\().4s,v\one\().4s //increase ctr
+ rev32 v\enc_ctr\().16b,v\ctr\().16b
+ aes_encrypt_round \enc_ctr,Key0
+ ldr q\hashkey0_ext,[\hashkey_adr],16
+ aes_encrypt_round \enc_ctr,Key1
+ pmull2 v\high\().1q,v\aadhash\().2d,v\hashkey0\().2d
+ pmull v\low\().1q,v\aadhash\().1d,v\hashkey0\().1d
+
+ .if \left_count > 1
+ ldr q\hashkey0,[\hashkey_adr],16
+ .endif
+ aes_encrypt_round \enc_ctr,Key2
+ pmull v\tmp1\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d
+ pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d
+ eor v\aadhash\().16b,v\aadhash\().16b,v\aadhash\().16b
+
+ .if \left_count > 1
+ ldr q\hashkey0_ext,[\hashkey_adr],16
+ .endif
+ aes_encrypt_round \enc_ctr,Key3
+ eor v\tmp0\().16b,v\tmp1\().16b,v\tmp0\().16b
+
+ aes_encrypt_round \enc_ctr,Key4
+ ext v\tmp1\().16b,v\aadhash\().16b,v\tmp0\().16b,8 //low
+ ext v\tmp0\().16b,v\tmp0\().16b,v\aadhash\().16b,8 //high
+ aes_encrypt_round \enc_ctr,Key5
+ eor v\low\().16b,v\low\().16b,v\tmp1\().16b
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ aes_encrypt_round \enc_ctr,Key6
+ ldr q\aadhash,[\dat_adr],16
+ aes_encrypt_round \enc_ctr,Key7
+ aes_encrypt_round \enc_ctr,Key8
+ aese v\enc_ctr\().16b,vKey9.16b
+ eor v\enc_ctr\().16b,v\enc_ctr\().16b,vKey10.16b
+ eor v\enc_ctr\().16b,v\enc_ctr\().16b,v\aadhash\().16b
+ .ifc \is_enc , encrypt
+ rbit v\aadhash\().16b,v\enc_ctr\().16b
+ .endif
+ .ifc \is_enc , decrypt
+ rbit v\aadhash\().16b,v\aadhash\().16b
+ .endif
+ str q\enc_ctr,[\out_adr],16
+.endm
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_256.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_256.S
new file mode 100644
index 000000000..fb6a6e94d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_256.S
@@ -0,0 +1,181 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#define KEY_LEN 256
+#include "gcm_common.S"
+ declare_var_vector_reg Key11,27
+ declare_var_vector_reg Key12,28
+ declare_var_vector_reg Key13,29
+ declare_var_vector_reg Key14,30
+#define KEY_REGS 0,1,2,3,4,5,6,7,8,9,10,11,12
+.macro aes_encrypt_block block:req
+ aes_encrypt_round \block,Key0
+ aes_encrypt_round \block,Key1
+ aes_encrypt_round \block,Key2
+ aes_encrypt_round \block,Key3
+ aes_encrypt_round \block,Key4
+ aes_encrypt_round \block,Key5
+ aes_encrypt_round \block,Key6
+ aes_encrypt_round \block,Key7
+ aes_encrypt_round \block,Key8
+ aes_encrypt_round \block,Key9
+ aes_encrypt_round \block,Key10
+ aes_encrypt_round \block,Key11
+ aes_encrypt_round \block,Key12
+ aese v\block\().16b,vKey13.16b
+ eor v\block\().16b,v\block\().16b,vKey14.16b
+.endm
+
+/*
+ Load Aes Keys to [vKey0..vKey8,vKeyLast0,vKeyLast1]
+	Load the AES-256 round keys into vKey0..vKey14
+.macro load_aes_keys key_addr:req
+ ld1 { vKey0.4s- vKey3.4s},[\key_addr],64
+ ld1 { vKey4.4s- vKey7.4s},[\key_addr],64
+ ld1 { vKey8.4s- vKey11.4s},[\key_addr],64
+ ld1 {vKey12.4s- vKey14.4s},[\key_addr],48
+.endm
+
+
+
+/*
+	[high,low] += aadhash * [hashkey0,hashkey0_ext]
+	dat = *dat_adr; dat_adr += 16
+	ctr += 1; enc_dat = aes_encrypt(ctr) ^ dat
+	*out_adr = enc_dat; out_adr += 16
+	aadhash = rbit(enc_dat) when encrypting, rbit(dat) when decrypting
+	if left_count > 1: [hashkey0,hashkey0_ext] = *hashkey_adr; hashkey_adr += 32
+*/
+.macro aes_gcm_middle is_enc:req,aadhash:req,dat_adr:req,hashkey_adr:req, \
+ hashkey0:req,hashkey0_ext:req,high:req,low:req, \
+ ctr:req,enc_ctr:req,one:req,out_adr:req, \
+ tmp0:req,tmp1:req,left_count:req
+
+ pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey0\().2d
+ pmull v\tmp1\().1q,v\aadhash\().1d,v\hashkey0\().1d
+ .if \left_count > 1
+ ldr q\hashkey0,[\hashkey_adr],16
+ .endif
+
+ add v\ctr\().4s,v\ctr\().4s,v\one\().4s //increase ctr
+
+ rev32 v\enc_ctr\().16b,v\ctr\().16b
+ aes_encrypt_round \enc_ctr,Key0
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ pmull v\tmp0\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d
+ eor v\low\().16b,v\low\().16b,v\tmp1\().16b
+ pmull2 v\tmp1\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d
+ .if \left_count > 1
+ ldr q\hashkey0_ext,[\hashkey_adr],16
+ .endif
+ eor v\aadhash\().16b,v\aadhash\().16b,v\aadhash\().16b
+ aes_encrypt_round \enc_ctr,Key1
+ aes_encrypt_round \enc_ctr,Key2
+ eor v\tmp0\().16b,v\tmp1\().16b,v\tmp0\().16b
+ aes_encrypt_round \enc_ctr,Key3
+ ext v\tmp1\().16b,v\aadhash\().16b,v\tmp0\().16b,8
+ ext v\tmp0\().16b,v\tmp0\().16b,v\aadhash\().16b,8
+ aes_encrypt_round \enc_ctr,Key4
+ eor v\low\().16b,v\low\().16b,v\tmp1\().16b
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ aes_encrypt_round \enc_ctr,Key5
+ ldr q\aadhash,[\dat_adr],16
+ aes_encrypt_round \enc_ctr,Key6
+ aes_encrypt_round \enc_ctr,Key7
+ aes_encrypt_round \enc_ctr,Key8
+ aes_encrypt_round \enc_ctr,Key9
+ aes_encrypt_round \enc_ctr,Key10
+ aes_encrypt_round \enc_ctr,Key11
+ aes_encrypt_round \enc_ctr,Key12
+ aese v\enc_ctr\().16b,vKey13.16b
+ eor v\enc_ctr\().16b,v\enc_ctr\().16b,vKey14.16b
+ eor v\enc_ctr\().16b,v\enc_ctr\().16b,v\aadhash\().16b
+ .ifc \is_enc , encrypt
+ rbit v\aadhash\().16b,v\enc_ctr\().16b
+ .endif
+ .ifc \is_enc , decrypt
+ rbit v\aadhash\().16b,v\aadhash\().16b
+ .endif
+ str q\enc_ctr,[\out_adr],16
+.endm
+
+.macro aes_gcm_init is_enc:req,aadhash:req,dat_adr:req,hashkey_adr:req, \
+ hashkey0:req,hashkey0_ext:req, high:req,low:req, \
+ ctr:req,enc_ctr:req,one:req,out_adr:req, \
+ tmp0:req,tmp1:req,left_count:req
+ ldr q\hashkey0,[\hashkey_adr],16
+ add v\ctr\().4s,v\ctr\().4s,v\one\().4s /*increase ctr */
+ rev32 v\enc_ctr\().16b,v\ctr\().16b
+ aes_encrypt_round \enc_ctr,Key0
+ ldr q\hashkey0_ext,[\hashkey_adr],16
+ aes_encrypt_round \enc_ctr,Key1
+ pmull2 v\high\().1q,v\aadhash\().2d,v\hashkey0\().2d
+ pmull v\low\().1q,v\aadhash\().1d,v\hashkey0\().1d
+
+ .if \left_count > 1
+ ldr q\hashkey0,[\hashkey_adr],16
+ .endif
+ aes_encrypt_round \enc_ctr,Key2
+ pmull v\tmp1\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d
+ pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d
+ eor v\aadhash\().16b,v\aadhash\().16b,v\aadhash\().16b
+
+ .if \left_count > 1
+ ldr q\hashkey0_ext,[\hashkey_adr],16
+ .endif
+ aes_encrypt_round \enc_ctr,Key3
+ eor v\tmp0\().16b,v\tmp1\().16b,v\tmp0\().16b
+
+ aes_encrypt_round \enc_ctr,Key4
+ ext v\tmp1\().16b,v\aadhash\().16b,v\tmp0\().16b,8 /*low */
+ ext v\tmp0\().16b,v\tmp0\().16b,v\aadhash\().16b,8 /* high */
+ aes_encrypt_round \enc_ctr,Key5
+ eor v\low\().16b,v\low\().16b,v\tmp1\().16b
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ aes_encrypt_round \enc_ctr,Key6
+ ldr q\aadhash,[\dat_adr],16
+ aes_encrypt_round \enc_ctr,Key7
+ aes_encrypt_round \enc_ctr,Key8
+ aes_encrypt_round \enc_ctr,Key9
+ aes_encrypt_round \enc_ctr,Key10
+ aes_encrypt_round \enc_ctr,Key11
+ aes_encrypt_round \enc_ctr,Key12
+ aese v\enc_ctr\().16b,vKey13.16b
+ eor v\enc_ctr\().16b,v\enc_ctr\().16b,vKey14.16b
+ eor v\enc_ctr\().16b,v\enc_ctr\().16b,v\aadhash\().16b
+ .ifc \is_enc , encrypt
+ rbit v\aadhash\().16b,v\enc_ctr\().16b
+ .endif
+ .ifc \is_enc , decrypt
+ rbit v\aadhash\().16b,v\aadhash\().16b
+ .endif
+ str q\enc_ctr,[\out_adr],16
+.endm
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_enc_dec.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_enc_dec.S
new file mode 100644
index 000000000..927179cfc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_enc_dec.S
@@ -0,0 +1,588 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+/*
+void gist_aes_gcm_dec_##mode( \
+ const struct gcm_key_data *key_data, \
+ struct gcm_context_data *context, \
+ uint8_t *out, \
+ uint8_t const *in, \
+ uint64_t len, \
+ uint8_t *iv, \
+ \
+ uint8_t const *aad, \
+ uint64_t aad_len, \
+ uint8_t *auth_tag, \
+ uint64_t auth_tag_len \
+ \
+ )
+ */
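A hedged usage sketch for the one-shot entry points documented above (assumes isa-l_crypto's public aes_gcm.h header and its aes_gcm_pre_128() key-expansion helper; buffer sizes are illustrative):

    #include <stdint.h>
    #include "aes_gcm.h"

    static void encrypt_example(void)
    {
            uint8_t key[16] = {0}, iv[12] = {0}, aad[16] = {0};
            uint8_t pt[64] = {0}, ct[64], tag[16];
            struct gcm_key_data kd;
            struct gcm_context_data ctx;

            aes_gcm_pre_128(key, &kd);      /* expand key and precompute hash-key table */
            aes_gcm_enc_128(&kd, &ctx, ct, pt, sizeof(pt), iv,
                            aad, sizeof(aad), tag, sizeof(tag));
    }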
+
+ declare_var_generic_reg key_data ,0
+ declare_var_generic_reg context ,1
+ declare_var_generic_reg out ,2
+ declare_var_generic_reg in ,3
+ declare_var_generic_reg len ,4
+ declare_var_generic_reg iv ,5
+ declare_var_generic_reg aad ,6
+ declare_var_generic_reg aad_len ,7
+
+ declare_var_generic_reg hashkey_base,0
+ declare_var_generic_reg hashkey_addr,5
+ declare_var_generic_reg left_len ,12
+ declare_var_generic_reg aad_left ,13
+ declare_var_generic_reg temp0 ,14
+ declare_var_generic_reg temp1 ,15
+
+ declare_var_generic_reg auth_tag ,0 /* input param */
+ declare_var_generic_reg auth_tag_len,1 /* input param */
+
+
+ declare_var_vector_reg Ctr,0
+ declare_var_vector_reg AadHash,1
+ declare_var_vector_reg HashKey0,2
+ declare_var_vector_reg HashKey0Ext,3
+ declare_var_vector_reg High,4
+ declare_var_vector_reg Low,5
+ declare_var_vector_reg EncCtr,6
+ declare_var_vector_reg Dat0,6
+ declare_var_vector_reg Middle0,7
+
+ declare_var_vector_reg Tmp0,8
+ declare_var_vector_reg Tmp1,9
+ declare_var_vector_reg Zero,10
+ declare_var_vector_reg Poly,11
+ declare_var_vector_reg LeftDat ,12
+ declare_var_vector_reg Len ,13
+ declare_var_vector_reg Tmp2,14
+ declare_var_vector_reg Tmp3,15
+
+ declare_var_vector_reg One,31
+ .set stack_size,64
+ .macro push_stack
+ stp d8, d9,[sp,-stack_size]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+
+ .endm
+
+ .macro pop_stack
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], stack_size
+ .endm
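Note that only the d (low 64-bit) views are spilled: AAPCS64 requires callees to preserve just the bottom 64 bits of v8-v15, so saving d8-d15 is sufficient here.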
+
+START_FUNC(enc,KEY_LEN,_)
+START_FUNC(enc,KEY_LEN,_nt_)
+ push_stack
+	/* save in_length and aad_length */
+ stp aad_len,len,[context,AAD_LEN_OFF]
+ load_aes_keys key_data
+ /* Init Consts and IV */
+ mov wtemp1,1
+ eor vOne.16b,vOne.16b,vOne.16b
+ ld1 {vCtr.d}[0],[iv],8
+ eor vZero.16b,vZero.16b,vZero.16b
+ ld1 {vCtr.s}[2],[iv]
+ mov temp0,0x87
+ rev32 vCtr.16b,vCtr.16b /* to cpu order */
+ ins vOne.s[3],wtemp1
+ mov vAadHash.16b,vZero.16b
+ dup vPoly.2d,temp0
+ ins vCtr.s[3],wtemp1 /* Initial Ctr and Orig IV */
+
+
+ and left_len,aad_len,0xf
+ cbz aad_len,24f
+ lsr aad_len,aad_len,4
+ /* Read small data */
+ cbz left_len,2f /* aad_len >= 16,skip */
+ add aad_left,aad,aad_len,lsl 4
+ read_small_data_start LeftDat,aad_left,left_len,temp0,Tmp0
+ cbnz left_len,1f /* aad_len & 0xf != 0 */
+2:
+ cbz aad_len,1f /* aad_len <16 skip*/
+ /* left_len == 0 && aad_len !=0 */
+ sub aad_len,aad_len,1
+ /* leftDat = aad[-1] */
+ ldr qLeftDat,[aad,aad_len,lsl 4]
+1:
+ cbnz aad_len,1f /* aad_len >16,skip */
+ rbit vAadHash.16b,vLeftDat.16b
+	b	24f	/* aad_len <= 16, skip aadhash calculation */
+1:
+ /* aad_len > 16 */
+ ldr qAadHash,[aad],16
+ rbit vAadHash.16b,vAadHash.16b
+ sub aad_len,aad_len,1
+
+1:
+ /* loop ghash_block */
+ cmp aad_len,HASHKEY_TOTAL_NUM - 1
+ bls 1f // break loop
+ sub aad_len,aad_len,HASHKEY_TOTAL_NUM
+ ghash_block_n HASHKEY_TOTAL_NUM,AadHash,Dat0,aad,hashkey_addr,hashkey_base, \
+ HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \
+ Tmp0,Tmp1
+ b 1b /* back to loop start */
+1:
+ cbnz aad_len,1f /* left aad_len >32,skip */
+ ldp qHashKey0,qHashKey0Ext,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32]
+ ghash_block_reg AadHash,LeftDat, \
+ HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \
+ Tmp0
+ b 24f /* left aad_len <=32,skip below check */
+1:
+ mov temp0,HASHKEY_TOTAL_NUM - 1
+ sub temp0,temp0,aad_len
+ add hashkey_addr,hashkey_base,temp0,lsl 5
+
+ ghash_mult_init_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Middle0,Tmp0,Dat0,2 /* load next hash */
+ sub aad_len,aad_len,1
+
+1:
+ cbz aad_len,1f
+ ghash_mult_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Middle0,Tmp0,Tmp1,Dat0, 2
+
+ sub aad_len,aad_len,1
+ b 1b
+1:
+ ghash_mult_round_noload AadHash,HashKey0,HashKey0Ext,High,Low,Middle0,Tmp0,Tmp1
+ rbit vAadHash.16b, vLeftDat.16b
+ ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly
+
+24:
+
+ /* Enc/Dec loop */
+ and left_len,len,15
+ cbz len,24f
+ lsr len,len,4
+1:
+ /* loop aes gcm enc/dec loop */
+ cmp len,HASHKEY_TOTAL_NUM - 1
+ bls 1f // break loop
+ sub len,len,HASHKEY_TOTAL_NUM
+ aes_gcm_n_round encrypt,HASHKEY_TOTAL_NUM,AadHash,in,hashkey_addr,hashkey_base, \
+ HashKey0,HashKey0Ext,High,Low,Poly, \
+ Ctr,EncCtr,One,out,Tmp0,Tmp1
+ b 1b /* back to loop start */
+1:
+ cbz len,24f /* left len == 0 */
+ mov temp0,HASHKEY_TOTAL_NUM
+ sub temp0,temp0,len
+ add hashkey_addr,hashkey_base,temp0,lsl 5
+
+ sub len,len,1
+ aes_gcm_init encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */
+ cbz len,2f
+ sub len,len,1
+1:
+
+ cbz len,1f
+ aes_gcm_middle encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */
+ sub len,len,1
+ b 1b
+1:
+ aes_gcm_middle encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,1 /* load next hash */
+2:
+ poly_mult_final_x2 AadHash,High,Low,Tmp0,Tmp1,Poly
+24:
+ /* complete part */
+ cmp left_len,0
+ movi vHigh.16b,0
+ mov temp0,HASHKEY_TOTAL_NUM-3
+ movi vLow.16b,0
+ cinc hashkey_addr,temp0,eq
+ movi vMiddle0.16b,0
+ add hashkey_addr,hashkey_base,hashkey_addr,lsl 5
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr],32
+ beq 2f
+ read_small_data_start LeftDat,in,left_len,temp0,Tmp0
+ add vCtr.4s,vCtr.4s,vOne.4s
+ rev32 vEncCtr.16b,vCtr.16b
+ aes_encrypt_round EncCtr,Key0
+ pmull2 vHigh.1q,vAadHash.2d,vHashKey0.2d
+ aes_encrypt_round EncCtr,Key1
+ pmull vLow.1q ,vAadHash.1d,vHashKey0.1d
+ aes_encrypt_round EncCtr,Key2
+ ldr qHashKey0,[hashkey_addr],16
+ aes_encrypt_round EncCtr,Key3
+ pmull vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d
+ aes_encrypt_round EncCtr,Key4
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d
+ aes_encrypt_round EncCtr,Key5
+ ldr qHashKey0Ext,[hashkey_addr],16
+ aes_encrypt_round EncCtr,Key6
+ eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b
+ aes_encrypt_round EncCtr,Key7
+ aes_encrypt_round EncCtr,Key8
+#if KEY_LEN==256
+ aes_encrypt_round EncCtr,Key9
+ aes_encrypt_round EncCtr,Key10
+ aes_encrypt_round EncCtr,Key11
+ aes_encrypt_round EncCtr,Key12
+ aese vEncCtr.16b,vKey13.16b
+ eor vEncCtr.16b,vEncCtr.16b,vKey14.16b
+#else
+ aese vEncCtr.16b,vKey9.16b
+ eor vEncCtr.16b,vEncCtr.16b,vKey10.16b
+#endif
+ eor vEncCtr.16b,vEncCtr.16b,vLeftDat.16b
+ write_small_data_start EncCtr,out,left_len,temp0,Tmp0
+ clear_small_data EncCtr,Zero,left_len,temp0,Tmp0
+ rbit vAadHash.16b,vEncCtr.16b
+2:
+
+ ldr qLen,[context,AAD_LEN_OFF] /* Len */
+ mov wtemp0,1 /* Ek */
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0.2d /* auth_dat * HashKey[Total-2] */
+ shl vLen.2d,vLen.2d,3 /* Len */
+ pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d /* auth_dat * HashKey[Total-2] */
+ rev64 vLen.16b,vLen.16b /* Len */
+ ins vCtr.4s[3],wtemp0 /* Ek */
+ ldr qHashKey0,[hashkey_addr],16 /* auth_dat * HashKey[Total-2] */
+ pmull vTmp2.1q,vAadHash.1d,vHashKey0Ext.1d /* auth_dat * HashKey[Total-2] */
+ rev32 vEncCtr.16b,vCtr.16b /* Ek */
+ eor vHigh.16b,vHigh.16b,vTmp0.16b /* auth_dat * HashKey[Total-2] */
+ pmull2 vTmp3.1q ,vAadHash.2d,vHashKey0Ext.2d /* auth_dat * HashKey[Total-2] */
+ rbit vAadHash.16b,vLen.16b /* Len */
+
+ aes_encrypt_round EncCtr,Key0 /* Ek */
+ eor vLow.16b,vLow.16b,vTmp1.16b /* auth_dat * HashKey[Total-2] */
+ aes_encrypt_round EncCtr,Key1 /* Ek */
+ ldr qHashKey0Ext,[hashkey_addr],16 /* auth_dat * HashKey[Total-2] */
+ aes_encrypt_round EncCtr,Key2 /* Ek */
+ eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b /* auth_dat * HashKey[Total-2] */
+ aes_encrypt_round EncCtr,Key3 /* Ek */
+ eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b /* auth_dat * HashKey[Total-2] */
+ aes_encrypt_round EncCtr,Key4 /* Ek */
+
+ pmull2 vTmp0.1q,vAadHash.2d,vHashKey0.2d /* Len * HashKey[Total-1] */
+ pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d /* Len * HashKey[Total-1] */
+ aes_encrypt_round EncCtr,Key5 /* Ek */
+ aes_encrypt_round EncCtr,Key6 /* Ek */
+ pmull vTmp2.1q,vAadHash.1d,vHashKey0Ext.1d /* Len * HashKey[Total-1] */
+ aes_encrypt_round EncCtr,Key7 /* Ek */
+ eor vHigh.16b,vHigh.16b,vTmp0.16b /* Len * HashKey[Total-1] */
+ pmull2 vTmp3.1q ,vAadHash.2d,vHashKey0Ext.2d /* Len * HashKey[Total-1] */
+ aes_encrypt_round EncCtr,Key8 /* Ek */
+ eor vLow.16b,vLow.16b,vTmp1.16b /* Len * HashKey[Total-1] */
+#if KEY_LEN==256
+ aes_encrypt_round EncCtr,Key9 /* Ek */
+ aes_encrypt_round EncCtr,Key10 /* Ek */
+ aes_encrypt_round EncCtr,Key11 /* Ek */
+ aes_encrypt_round EncCtr,Key12 /* Ek */
+ aese vEncCtr.16b,vKey13.16b /* Ek */
+ eor vEncCtr.16b,vEncCtr.16b,vKey14.16b /* Ek */
+#else
+ aese vEncCtr.16b,vKey9.16b /* Ek */
+ eor vEncCtr.16b,vEncCtr.16b,vKey10.16b /* Ek */
+#endif
+ eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b /* Len * HashKey[Total-1] */
+ eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b /* Len * HashKey[Total-1] */
+ rbit vAadHash.16b,vEncCtr.16b /* Aad */
+
+ ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly
+
+ ldp auth_tag,auth_tag_len,[sp,stack_size] /* Adjust here : TODO TBD */
+ rbit vAadHash.16b,vAadHash.16b /* Aad */
+
+
+ /* output auth_tag */
+ cmp auth_tag_len,16
+ bne 1f
+ /* most likely auth_tag_len=16 */
+ str qAadHash,[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=12 */
+ cmp auth_tag_len,12
+ bne 1f
+ str dAadHash,[auth_tag],8
+ st1 {vAadHash.s}[2],[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=8 */
+ str dAadHash,[auth_tag]
+ pop_stack
+ ret
+END_FUNC(enc,KEY_LEN,_)
+END_FUNC(enc,KEY_LEN,_nt_)
+
+
+START_FUNC(dec,KEY_LEN,_)
+START_FUNC(dec,KEY_LEN,_nt_)
+ push_stack
+ /* save in_length and aad_length */
+ stp aad_len,len,[context,AAD_LEN_OFF]
+ load_aes_keys key_data
+ /* Init Consts and IV */
+ mov wtemp1,1
+ eor vOne.16b,vOne.16b,vOne.16b
+ ld1 {vCtr.d}[0],[iv],8
+ eor vZero.16b,vZero.16b,vZero.16b
+ ld1 {vCtr.s}[2],[iv]
+ mov temp0,0x87
+ rev32 vCtr.16b,vCtr.16b /* to cpu order */
+ mov vAadHash.16b,vZero.16b
+ ins vOne.s[3],wtemp1
+ dup vPoly.2d,temp0
+ ins vCtr.s[3],wtemp1 /* Initial Ctr and Orig IV */
+
+ ldp qHashKey0,qHashKey0Ext,[hashkey_base]
+ and left_len,aad_len,0xf
+ cbz aad_len,24f
+ lsr aad_len,aad_len,4
+ /* Read small data */
+ cbz left_len,2f /* aad_len >= 16,skip */
+ add aad_left,aad,aad_len,lsl 4
+ read_small_data_start LeftDat,aad_left,left_len,temp0,Tmp0
+ cbnz left_len,1f /* aad_len & 0xf != 0 */
+2:
+ cbz aad_len,1f /* aad_len <16 skip */
+ /* left_len == 0 && aad_len !=0 */
+ sub aad_len,aad_len,1
+ /* leftDat = aad[-1] */
+ ldr qLeftDat,[aad,aad_len,lsl 4]
+1:
+ cbnz aad_len,1f /* aad_len >16,skip */
+ rbit vAadHash.16b,vLeftDat.16b
+	b	24f	/* aad_len <= 16, skip aadhash calculation */
+1:
+ /* aad_len > 16 */
+ ldr qAadHash,[aad],16
+ rbit vAadHash.16b,vAadHash.16b
+ sub aad_len,aad_len,1
+
+1:
+ /** loop ghash_block */
+ cmp aad_len,HASHKEY_TOTAL_NUM - 1
+ bls 1f /* break loop */
+ sub aad_len,aad_len,HASHKEY_TOTAL_NUM
+ ghash_block_n HASHKEY_TOTAL_NUM,AadHash,Dat0,aad,hashkey_addr,hashkey_base, \
+ HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \
+ Tmp0,Tmp1
+ b 1b /* back to loop start */
+1:
+ cbnz aad_len,1f /* left aad_len >32,skip */
+ ldp qHashKey0,qHashKey0Ext,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32]
+ ghash_block_reg AadHash,LeftDat, \
+ HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \
+ Tmp0
+ b 24f /* left aad_len <=32,skip below check */
+1:
+ mov temp0,HASHKEY_TOTAL_NUM - 1
+ sub temp0,temp0,aad_len
+ add hashkey_addr,hashkey_base,temp0,lsl 5
+
+ ghash_mult_init_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Middle0,Tmp0,Dat0,2 /* load next hash */
+ sub aad_len,aad_len,1
+
+1:
+ cbz aad_len,1f
+ ghash_mult_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Middle0,Tmp0,Tmp1,Dat0, 2
+
+ sub aad_len,aad_len,1
+ b 1b
+1:
+ ghash_mult_round_noload AadHash,HashKey0,HashKey0Ext,High,Low,Middle0,Tmp0,Tmp1
+ rbit vAadHash.16b, vLeftDat.16b
+ ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly
+
+24:
+
+
+ /* Enc/Dec loop */
+ and left_len,len,15
+ cbz len,24f
+ lsr len,len,4
+1:
+ /* loop aes gcm enc/dec loop */
+ cmp len,HASHKEY_TOTAL_NUM - 1
+ bls 1f // break loop
+ sub len,len,HASHKEY_TOTAL_NUM
+ aes_gcm_n_round decrypt,HASHKEY_TOTAL_NUM,AadHash,in,hashkey_addr,hashkey_base, \
+ HashKey0,HashKey0Ext,High,Low,Poly, \
+ Ctr,EncCtr,One,out,Tmp0,Tmp1
+ b 1b /* back to loop start */
+1:
+ cbz len,24f /* left len == 0 */
+ mov temp0,HASHKEY_TOTAL_NUM
+ sub temp0,temp0,len
+ add hashkey_addr,hashkey_base,temp0,lsl 5
+
+ sub len,len,1
+ aes_gcm_init decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */
+ cbz len,2f
+ sub len,len,1
+1:
+
+ cbz len,1f
+ aes_gcm_middle decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */
+ sub len,len,1
+ b 1b
+1:
+ aes_gcm_middle decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,1 /* load next hash */
+2:
+ poly_mult_final_x2 AadHash,High,Low,Tmp0,Tmp1,Poly
+24:
+ /* complete part */
+ cmp left_len,0
+ movi vHigh.16b,0
+	mov	temp0,HASHKEY_TOTAL_NUM-3
+ movi vLow.16b,0
+ cinc hashkey_addr,temp0,eq
+ movi vMiddle0.16b,0
+ add hashkey_addr,hashkey_base,hashkey_addr,lsl 5
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr],32
+ beq 2f
+ read_small_data_start LeftDat,in,left_len,temp0,Tmp0
+ add vCtr.4s,vCtr.4s,vOne.4s
+ rev32 vEncCtr.16b,vCtr.16b
+ aes_encrypt_round EncCtr,Key0
+ pmull2 vHigh.1q,vAadHash.2d,vHashKey0.2d
+ aes_encrypt_round EncCtr,Key1
+ pmull vLow.1q ,vAadHash.1d,vHashKey0.1d
+ aes_encrypt_round EncCtr,Key2
+ ldr qHashKey0,[hashkey_addr],16
+ aes_encrypt_round EncCtr,Key3
+ pmull vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d
+ aes_encrypt_round EncCtr,Key4
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d
+ aes_encrypt_round EncCtr,Key5
+ ldr qHashKey0Ext,[hashkey_addr],16
+ aes_encrypt_round EncCtr,Key6
+ eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b
+ aes_encrypt_round EncCtr,Key7
+ aes_encrypt_round EncCtr,Key8
+#if KEY_LEN==256
+ aes_encrypt_round EncCtr,Key9
+ aes_encrypt_round EncCtr,Key10
+ aes_encrypt_round EncCtr,Key11
+ aes_encrypt_round EncCtr,Key12
+ aese vEncCtr.16b,vKey13.16b
+ eor vEncCtr.16b,vEncCtr.16b,vKey14.16b
+ eor vEncCtr.16b,vEncCtr.16b,vLeftDat.16b
+#endif
+#if KEY_LEN==128
+ aese vEncCtr.16b,vKey9.16b
+ eor vEncCtr.16b,vEncCtr.16b,vKey10.16b
+ eor vEncCtr.16b,vEncCtr.16b,vLeftDat.16b
+#endif
+ write_small_data_start EncCtr,out,left_len,temp0,Tmp0
+ rbit vAadHash.16b,vLeftDat.16b
+
+2:
+
+ ldr qLen,[context,AAD_LEN_OFF] /* Len */
+ mov wtemp0,1 /* Ek */
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0.2d /* auth_dat * HashKey[Total-2] */
+ shl vLen.2d,vLen.2d,3 /* Len */
+ pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d /* auth_dat * HashKey[Total-2] */
+ rev64 vLen.16b,vLen.16b /* Len */
+ ins vCtr.4s[3],wtemp0 /* Ek */
+ ldr qHashKey0,[hashkey_addr],16 /* auth_dat * HashKey[Total-2] */
+ pmull vTmp2.1q,vAadHash.1d,vHashKey0Ext.1d /* auth_dat * HashKey[Total-2] */
+ rev32 vEncCtr.16b,vCtr.16b /* Ek */
+ eor vHigh.16b,vHigh.16b,vTmp0.16b /* auth_dat * HashKey[Total-2] */
+ pmull2 vTmp3.1q ,vAadHash.2d,vHashKey0Ext.2d /* auth_dat * HashKey[Total-2] */
+ rbit vAadHash.16b,vLen.16b /* Len */
+
+ aes_encrypt_round EncCtr,Key0 /* Ek */
+ eor vLow.16b,vLow.16b,vTmp1.16b /* auth_dat * HashKey[Total-2] */
+ aes_encrypt_round EncCtr,Key1 /* Ek */
+ ldr qHashKey0Ext,[hashkey_addr],16 /* auth_dat * HashKey[Total-2] */
+ aes_encrypt_round EncCtr,Key2 /* Ek */
+ eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b /* auth_dat * HashKey[Total-2] */
+ aes_encrypt_round EncCtr,Key3 /* Ek */
+ eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b /* auth_dat * HashKey[Total-2] */
+ aes_encrypt_round EncCtr,Key4 /* Ek */
+
+ pmull2 vTmp0.1q,vAadHash.2d,vHashKey0.2d /* Len * HashKey[Total-1] */
+ pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d /* Len * HashKey[Total-1] */
+ aes_encrypt_round EncCtr,Key5 /* Ek */
+ aes_encrypt_round EncCtr,Key6 /* Ek */
+ pmull vTmp2.1q,vAadHash.1d,vHashKey0Ext.1d /* Len * HashKey[Total-1] */
+ aes_encrypt_round EncCtr,Key7 /* Ek */
+ eor vHigh.16b,vHigh.16b,vTmp0.16b /* Len * HashKey[Total-1] */
+ pmull2 vTmp3.1q ,vAadHash.2d,vHashKey0Ext.2d /* Len * HashKey[Total-1] */
+ aes_encrypt_round EncCtr,Key8 /* Ek */
+ eor vLow.16b,vLow.16b,vTmp1.16b /* Len * HashKey[Total-1] */
+#if KEY_LEN==256
+ aes_encrypt_round EncCtr,Key9 /* Ek */
+ aes_encrypt_round EncCtr,Key10 /* Ek */
+ aes_encrypt_round EncCtr,Key11 /* Ek */
+ aes_encrypt_round EncCtr,Key12 /* Ek */
+ aese vEncCtr.16b,vKey13.16b /* Ek */
+ eor vEncCtr.16b,vEncCtr.16b,vKey14.16b /* Ek */
+#else
+ aese vEncCtr.16b,vKey9.16b /* Ek */
+ eor vEncCtr.16b,vEncCtr.16b,vKey10.16b /* Ek */
+#endif
+ eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b /* Len * HashKey[Total-1] */
+ eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b /* Len * HashKey[Total-1] */
+ rbit vAadHash.16b,vEncCtr.16b /* Aad */
+
+ ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly
+
+ ldp auth_tag,auth_tag_len,[sp,stack_size] /* Adjust here : TODO TBD */
+ rbit vAadHash.16b,vAadHash.16b /* Aad */
+
+
+ /* output auth_tag */
+ cmp auth_tag_len,16
+ bne 1f
+ /* most likely auth_tag_len=16 */
+ str qAadHash,[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=12 */
+ cmp auth_tag_len,12
+ bne 1f
+ str dAadHash,[auth_tag],8
+ st1 {vAadHash.s}[2],[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=8 */
+ str dAadHash,[auth_tag]
+ pop_stack
+ ret
+END_FUNC(dec,KEY_LEN,_)
+END_FUNC(dec,KEY_LEN,_nt_)
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_multibinary_aarch64.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_multibinary_aarch64.S
new file mode 100644
index 000000000..b5433a1df
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_multibinary_aarch64.S
@@ -0,0 +1,58 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aarch64_multibinary.h"
+
+mbin_interface aes_gcm_enc_128
+mbin_interface aes_gcm_dec_128
+mbin_interface aes_gcm_precomp_128
+mbin_interface aes_gcm_enc_256
+mbin_interface aes_gcm_dec_256
+mbin_interface aes_gcm_precomp_256
+
+
+mbin_interface aes_gcm_enc_128_update
+mbin_interface aes_gcm_enc_128_finalize
+mbin_interface aes_gcm_dec_128_update
+mbin_interface aes_gcm_dec_128_finalize
+mbin_interface aes_gcm_enc_256_update
+mbin_interface aes_gcm_enc_256_finalize
+mbin_interface aes_gcm_dec_256_update
+mbin_interface aes_gcm_dec_256_finalize
+
+mbin_interface aes_gcm_init_256
+mbin_interface aes_gcm_init_128
+mbin_interface aes_gcm_enc_128_nt
+mbin_interface aes_gcm_enc_128_update_nt
+mbin_interface aes_gcm_dec_128_nt
+mbin_interface aes_gcm_dec_128_update_nt
+mbin_interface aes_gcm_enc_256_nt
+mbin_interface aes_gcm_enc_256_update_nt
+mbin_interface aes_gcm_dec_256_nt
+mbin_interface aes_gcm_dec_256_update_nt
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_precomp.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_precomp.S
new file mode 100644
index 000000000..e555c9798
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_precomp.S
@@ -0,0 +1,83 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+/*
+void aes_gcm_precomp(struct gcm_key_data *key_data);
+*/
+ declare_var_generic_reg key_data ,0
+ declare_var_generic_reg temp0 ,1
+ declare_var_generic_reg hashkey_base,0
+ declare_var_generic_reg hashkey_addr,1
+
+ declare_var_vector_reg Low ,0
+ declare_var_vector_reg Middle0 ,1
+ declare_var_vector_reg Middle1 ,2
+ declare_var_vector_reg High ,3
+ declare_var_vector_reg HashKeyIter ,4
+ declare_var_vector_reg HashKey ,5
+ declare_var_vector_reg HashKeyExt ,6
+ declare_var_vector_reg Poly ,7
+ declare_var_vector_reg Zero ,31
+
+START_FUNC(precomp,KEY_LEN,_)
+ load_aes_keys key_data
+ mov temp0,0x87
+ eor vZero.16b,vZero.16b,vZero.16b
+ eor vHashKey.16b,vHashKey.16b,vHashKey.16b
+ dup vPoly.2d,temp0
+ aes_encrypt_block HashKey
+ add hashkey_addr,hashkey_base,(HASHKEY_TOTAL_NUM-1)*32
+ rbit vHashKey.16b,vHashKey.16b
+ ext vHashKeyExt.16b,vHashKey.16b,vHashKey.16b,8
+ mov vHashKeyIter.16b,vHashKey.16b
+ stp qHashKey,qHashKeyExt,[hashkey_addr],-32
+
+1:
+ pmull vMiddle0.1q,vHashKeyIter.1d,vHashKeyExt.1d
+ pmull2 vMiddle1.1q,vHashKeyIter.2d,vHashKeyExt.2d
+ pmull vLow.1q ,vHashKeyIter.1d,vHashKey.1d
+ eor vMiddle0.16b,vMiddle0.16b,vMiddle1.16b
+ pmull2 vHigh.1q ,vHashKeyIter.2d,vHashKey.2d
+ ext vMiddle1.16b,vMiddle0.16b,vZero.16b,8 //high
+ ext vMiddle0.16b,vZero.16b,vMiddle0.16b,8 //low
+ eor vHigh.16b ,vHigh.16b,vMiddle1.16b
+ eor vLow.16b ,vLow.16b ,vMiddle0.16b
+ pmull2 vMiddle0.1q ,vHigh.2d ,vPoly.2d
+ ext vMiddle1.16b,vMiddle0.16b,vZero.16b,8 //high
+ ext vMiddle0.16b,vZero.16b,vMiddle0.16b,8 //low
+ eor vHigh.16b ,vHigh.16b,vMiddle1.16b
+ eor vLow.16b ,vLow.16b ,vMiddle0.16b
+ pmull vMiddle0.1q ,vHigh.1d ,vPoly.1d
+ eor vHashKeyIter.16b,vLow.16b,vMiddle0.16b
+ ext vLow.16b,vHashKeyIter.16b,vHashKeyIter.16b,8
+ stp qHashKeyIter,qLow,[hashkey_addr],-32
+ cmp hashkey_addr,hashkey_base
+ bcs 1b
+
+ ret
+END_FUNC(precomp,KEY_LEN,_)
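The precompute routine above derives H by encrypting the all-zero block with the loaded AES key schedule, bit-reflects it, and then fills the GHASH hash-key table from the highest offset downward, storing each successive power of H together with a halves-swapped copy for the pmull/pmull2 pairs. A minimal C sketch of the caller-visible setup step, assuming the public wrapper name and signature from isa-l_crypto's aes_gcm.h (aes_gcm_pre_128 is understood to run key expansion plus this precompute; treat the exact prototype as an assumption, not a restatement of this patch):

    #include "aes_gcm.h"     /* isa-l_crypto public header (assumed available) */

    /* One-time setup per key: expand the AES key schedule and build the
     * GHASH hash-key table that aes_gcm_precomp_{128,256} fills above. */
    static void gcm_setup_128(const uint8_t key[16], struct gcm_key_data *kd)
    {
            aes_gcm_pre_128(key, kd);   /* assumed wrapper: keyexp + precomp */
    }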
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_update.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_update.S
new file mode 100644
index 000000000..d47c52212
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_update.S
@@ -0,0 +1,277 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+/*
+void gist_aes_gcm_dec_update_##mode( \
+ const struct gcm_key_data *key_data, \
+ struct gcm_context_data *context, \
+ uint8_t *out, \
+ const uint8_t *in, \
+ uint64_t len \
+ )
+ */
+
+ declare_var_generic_reg key_data ,0
+ declare_var_generic_reg context ,1
+ declare_var_generic_reg out ,2
+ declare_var_generic_reg in ,3
+ declare_var_generic_reg len ,4
+ declare_var_generic_reg partial_block_length,5
+ declare_var_generic_reg blocks ,5
+ declare_var_generic_reg hashkey_base,0
+ declare_var_generic_reg hashkey_addr,6
+ declare_var_generic_reg temp0 ,14
+ declare_var_generic_reg temp1 ,15
+ declare_var_generic_reg temp2 ,13
+
+
+
+ declare_var_vector_reg Ctr,0
+ declare_var_vector_reg AadHash,1
+ declare_var_vector_reg HashKey0,2
+ declare_var_vector_reg HashKey0Ext,3
+ declare_var_vector_reg High,4
+ declare_var_vector_reg Low,5
+ declare_var_vector_reg EncCtr,6
+ declare_var_vector_reg Middle,7
+
+ declare_var_vector_reg Tmp0,8
+ declare_var_vector_reg Tmp1,9
+ declare_var_vector_reg Zero,10
+ declare_var_vector_reg Poly,11
+ declare_var_vector_reg PartialBlock ,12
+ declare_var_vector_reg One,31
+ .set stack_size,48
+ .macro push_stack
+ stp d8, d9, [sp,-stack_size]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+
+ .endm
+
+ .macro pop_stack
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d8, d9, [sp], stack_size
+ .endm
+/*
+ 20:exit_without_popstack
+ 21:start_of_mainloop
+ 22:exit_with_popstack
+ 23:partial_block_start
+ */
+START_FUNC(enc,KEY_LEN,_update_)
+START_FUNC(enc,KEY_LEN,_update_nt_)
+ ldr temp0,[context,IN_LENGTH_OFF] /*load in_length */
+ ldr partial_block_length,[context,PARTIAL_BLOCK_LENGTH_OFF]
+ ldr qAadHash,[context]
+ cbz len,20f /** if(len==0)return; exit_without_popstack*/
+ push_stack
+ add temp0,temp0,len /* temp0=temp0+len */
+ load_aes_keys key_data
+ str temp0,[context,IN_LENGTH_OFF] /* save in_length */
+ /* Init Consts and IV */
+ ldr qCtr,[context,CTR_OFF]
+ mov wtemp1,1
+ eor vOne.16b,vOne.16b,vOne.16b
+ mov temp0,0x87
+ eor vZero.16b,vZero.16b,vZero.16b
+ ins vOne.s[3],wtemp1
+ dup vPoly.2d,temp0
+ cbnz partial_block_length,23f /* if(partial_block_length!=0) not normal case*/
+21: /* start_of_mainloop */
+ cbz len,24f
+ lsr blocks,len,4
+ cmp blocks,HASHKEY_TOTAL_NUM - 1
+ and len,len,0xf
+	/* main aes gcm enc/dec loop */
+ bls 2f /* skip loop */
+1:
+ sub blocks,blocks,HASHKEY_TOTAL_NUM
+ cmp blocks,HASHKEY_TOTAL_NUM - 1
+ aes_gcm_n_round encrypt,HASHKEY_TOTAL_NUM,AadHash,in,hashkey_addr,hashkey_base, \
+ HashKey0,HashKey0Ext,High,Low,Poly, \
+ Ctr,EncCtr,One,out,Tmp0,Tmp1
+ bhi 1b /* back to loop start */
+2:
+	cbz	blocks,4f	/* left blocks == 0 */
+ /* -(blocks - HASHKEY_TOTAL_NUM) */
+ sub temp0,blocks,HASHKEY_TOTAL_NUM
+ neg temp0,temp0
+ sub blocks,blocks,1
+ add hashkey_addr,hashkey_base,temp0,lsl 5
+
+ aes_gcm_init encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */
+ cbz blocks,3f /* origin_blocks == 1 */
+ sub blocks,blocks,1
+
+ cbz blocks,2f /* origin_blocks == 2 */
+1:
+ sub blocks,blocks,1
+ aes_gcm_middle encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */
+ cbnz blocks,1b
+2:
+ aes_gcm_middle encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,1 /* not load next hash */
+3:
+ poly_mult_final_x2 AadHash,High,Low,Tmp0,Tmp1,Poly
+4:
+ str qAadHash,[context]
+ str qCtr,[context,CTR_OFF]
+ cbnz len,24f
+22: /* exit_with_popstack */
+ pop_stack
+20: /* exit_without_popstack */
+ ret
+23: /* partial_block_start */
+
+ generic_partial_block_start encrypt,len,in,out,context, \
+ temp2,partial_block_length,temp0,temp1,hashkey_addr
+ cbnz partial_block_length,22b
+ ldr qHashKey0Ext,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32+16]
+ ldr qHashKey0 ,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32]
+ ldr qPartialBlock,[context,PARTIAL_BLOCK_ENC_KEY_OFF]
+ ghash_block_reg AadHash,PartialBlock,HashKey0,HashKey0Ext, \
+ High,Low,Middle,Zero,Poly,Tmp0
+ str qAadHash,[context]
+ cbz len,4b
+ cmp len,15
+ bhi 21b
+24: /* partial_block_end */
+ add vCtr.4s,vCtr.4s,vOne.4s
+ read_small_data_start PartialBlock,in,len,temp0,Tmp0
+ rev32 vEncCtr.16b,vCtr.16b
+ str qCtr,[context,CTR_OFF]
+ aes_encrypt_block EncCtr
+ eor vPartialBlock.16b,vPartialBlock.16b,vEncCtr.16b
+ str qPartialBlock,[context,PARTIAL_BLOCK_ENC_KEY_OFF]
+ write_small_data_start PartialBlock,out,len,temp0,Tmp0
+ str len,[context,PARTIAL_BLOCK_LENGTH_OFF]
+ pop_stack
+ ret
+
+END_FUNC(enc,KEY_LEN,_update_)
+END_FUNC(enc,KEY_LEN,_update_nt_)
+
+
+START_FUNC(dec,KEY_LEN,_update_)
+START_FUNC(dec,KEY_LEN,_update_nt_)
+ ldr temp0,[context,IN_LENGTH_OFF] /*load in_length */
+ ldr partial_block_length,[context,PARTIAL_BLOCK_LENGTH_OFF]
+ ldr qAadHash,[context]
+ cbz len,20f /** if(len==0)return; exit_without_popstack*/
+ push_stack
+ add temp0,temp0,len /* temp0=temp0+len */
+ load_aes_keys key_data
+ str temp0,[context,IN_LENGTH_OFF] /* save in_length */
+ /* Init Consts and IV */
+ ldr qCtr,[context,CTR_OFF]
+ mov wtemp1,1
+ eor vOne.16b,vOne.16b,vOne.16b
+ mov temp0,0x87
+ eor vZero.16b,vZero.16b,vZero.16b
+ ins vOne.s[3],wtemp1
+ dup vPoly.2d,temp0
+ cbnz partial_block_length,23f /* if(partial_block_length!=0) not normal case*/
+21: /* start_of_mainloop */
+ cbz len,24f
+ lsr blocks,len,4
+ cmp blocks,HASHKEY_TOTAL_NUM - 1
+ and len,len,0xf
+	/* main aes gcm enc/dec loop */
+ bls 2f /* skip loop */
+1:
+ sub blocks,blocks,HASHKEY_TOTAL_NUM
+ cmp blocks,HASHKEY_TOTAL_NUM - 1
+ aes_gcm_n_round decrypt,HASHKEY_TOTAL_NUM,AadHash,in,hashkey_addr,hashkey_base, \
+ HashKey0,HashKey0Ext,High,Low,Poly, \
+ Ctr,EncCtr,One,out,Tmp0,Tmp1
+ bhi 1b /* back to loop start */
+2:
+ cbz blocks,4f /* left blocks == 0 */
+ /* -(blocks - HASHKEY_TOTAL_NUM) */
+ sub temp0,blocks,HASHKEY_TOTAL_NUM
+ neg temp0,temp0
+ sub blocks,blocks,1
+ add hashkey_addr,hashkey_base,temp0,lsl 5
+
+ aes_gcm_init decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 // load next hash
+ cbz blocks,3f /* origin_blocks == 1 */
+ sub blocks,blocks,1
+
+ cbz blocks,2f /* origin_blocks == 2 */
+1:
+ sub blocks,blocks,1
+ aes_gcm_middle decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */
+ cbnz blocks,1b
+2:
+ aes_gcm_middle decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,1 /* not load next hash */
+3:
+ poly_mult_final_x2 AadHash,High,Low,Tmp0,Tmp1,Poly
+4:
+ str qAadHash,[context]
+ str qCtr,[context,CTR_OFF]
+ cbnz len,24f
+22: /* exit_with_popstack */
+ pop_stack
+20: /* exit_without_popstack */
+ ret
+23: /* partial_block_start */
+
+ generic_partial_block_start decrypt,len,in,out,context, \
+ temp2,partial_block_length,temp0,temp1,hashkey_addr
+ cbnz partial_block_length,22b
+ ldr qHashKey0Ext,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32+16]
+ ldr qHashKey0 ,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32]
+ ldr qPartialBlock,[context,PARTIAL_BLOCK_ENC_KEY_OFF]
+ ghash_block_reg AadHash,PartialBlock,HashKey0,HashKey0Ext, \
+ High,Low,Middle,Zero,Poly,Tmp0
+ str qAadHash,[context]
+ cbz len,4b
+ cmp len,15
+ bhi 21b
+24: /* partial_block_end */
+ add vCtr.4s,vCtr.4s,vOne.4s
+ read_small_data_start PartialBlock,in,len,temp0,Tmp0
+ rev32 vEncCtr.16b,vCtr.16b
+ str qCtr,[context,CTR_OFF]
+ aes_encrypt_block EncCtr
+ eor vEncCtr.16b,vPartialBlock.16b,vEncCtr.16b
+ tbx_small_data_start EncCtr,PartialBlock,len,temp0,Tmp0
+ write_small_data_start EncCtr,out,len,temp0,Tmp0
+ str qPartialBlock,[context,PARTIAL_BLOCK_ENC_KEY_OFF]
+ str len,[context,PARTIAL_BLOCK_LENGTH_OFF]
+ pop_stack
+ ret
+END_FUNC(dec,KEY_LEN,_update_)
+END_FUNC(dec,KEY_LEN,_update_nt_)
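The update entry points above are what make the streaming API chunk-size agnostic: they first drain any partial block carried in gcm_context_data (label 23), then run the unrolled HASHKEY_TOTAL_NUM-block main loop, and finally stash a trailing partial block plus its keystream back into the context (label 24). A hedged C sketch of the intended call pattern, using the update signature documented in the comment above together with an assumed aes_gcm_init_128/aes_gcm_enc_128_finalize pair from aes_gcm.h:

    #include <stdint.h>
    #include "aes_gcm.h"     /* struct gcm_key_data, struct gcm_context_data */

    /* Encrypt a message fed in arbitrary-sized chunks; the context carries
     * the counter, the running GHASH and any partial block between calls. */
    static void stream_encrypt_128(const struct gcm_key_data *kd,
                                   struct gcm_context_data *ctx,
                                   uint8_t *iv, const uint8_t *aad, uint64_t aad_len,
                                   const uint8_t *msg, uint64_t msg_len,
                                   uint8_t *out, uint8_t tag[16])
    {
            aes_gcm_init_128(kd, ctx, iv, aad, aad_len);      /* assumed signature */
            for (uint64_t off = 0; off < msg_len; ) {
                    uint64_t n = msg_len - off;
                    if (n > 4096)
                            n = 4096;           /* any chunk size works per call */
                    aes_gcm_enc_128_update(kd, ctx, out + off, msg + off, n);
                    off += n;
            }
            aes_gcm_enc_128_finalize(kd, ctx, tag, 16);       /* assumed signature */
    }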
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_128_aarch64_aes.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_128_aarch64_aes.S
new file mode 100644
index 000000000..4a3e990c3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_128_aarch64_aes.S
@@ -0,0 +1,134 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.arch armv8-a+crypto
+
+ .text
+/*
+Macros
+*/
+#define NUM_ROUNDS(a) (7+(a)/32)
+.macro declare_var_vector_reg name:req,reg:req
+ q\name .req q\reg
+ v\name .req v\reg
+ s\name .req s\reg
+.endm
+.macro round_128 off:req,rcon:req
+ .if \off == 0
+ ldp w_tmp2,w_tmp3,[key,8]
+ ldp w_tmp0,w_tmp1,[key]
+ movi vzero.4s,0
+ dup vsrc.4s,w_tmp3
+ stp w_tmp2,w_tmp3,[exp_key_enc,8]
+ stp w_tmp0,w_tmp1,[exp_key_enc]
+ .endif
+ mov w0,\rcon
+ mov vdest.16b,vzero.16b
+ aese vdest.16b,vsrc.16b
+ mov w_tmp4,vdest.s[0]
+ eor w_tmp0,w_tmp0,w0
+ eor w_tmp0,w_tmp0,w_tmp4,ror 8
+ eor w_tmp1,w_tmp0,w_tmp1
+ eor w_tmp2,w_tmp1,w_tmp2
+ eor w_tmp3,w_tmp2,w_tmp3
+ stp w_tmp0,w_tmp1,[exp_key_enc,KEY_LEN*\off+KEY_LEN]
+ stp w_tmp2,w_tmp3,[exp_key_enc,KEY_LEN*\off+8+KEY_LEN]
+ .if \off != 10
+ dup vsrc.4s,w_tmp3
+ .endif
+.endm
+.macro export_dec_key rounds:req,enc_key:req,dec_key:req
+ ldr q0,[\enc_key]
+ ldr q1,[\enc_key,(\rounds-1)*16]
+ str q0,[\dec_key,(\rounds-1)*16]
+ str q1,[\dec_key]
+ ldp q0,q1,[\enc_key,1*16]
+ ldp q2,q3,[\enc_key,(1+2)*16]
+ ldp q4,q5,[\enc_key,(1+4)*16]
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ ldp q6,q7,[\enc_key,(1+6)*16]
+ aesimc v2.16b,v2.16b
+ aesimc v3.16b,v3.16b
+ stp q1,q0,[\dec_key,(\rounds-1-2)*16]
+ aesimc v4.16b,v4.16b
+ aesimc v5.16b,v5.16b
+ stp q3,q2,[\dec_key,(\rounds-1-4)*16]
+ ldr q0,[\enc_key,(1+8)*16]
+ aesimc v6.16b,v6.16b
+ aesimc v7.16b,v7.16b
+ stp q5,q4,[\dec_key,(\rounds-1-6)*16]
+ aesimc v0.16b,v0.16b
+ stp q7,q6,[\dec_key,(\rounds-1-8)*16]
+ str q0,[\dec_key,(\rounds-1-9)*16]
+.endm
+/**
+ void aes_keyexp_128_aes(const uint8_t * key,
+ uint8_t * exp_key_enc, uint8_t * exp_key_dec)
+*/
+ key .req x0
+ exp_key_enc .req x1
+ exp_key_dec .req x2
+ .equ KEY_LEN, (128/8)
+ w_tmp0 .req w3
+ w_tmp1 .req w4
+ w_tmp2 .req w5
+ w_tmp3 .req w6
+ w_tmp4 .req w7
+ declare_var_vector_reg dest,0
+ declare_var_vector_reg zero,1
+ declare_var_vector_reg src, 2
+
+
+ .global aes_keyexp_128_aes
+ .type aes_keyexp_128_aes, %function
+
+aes_keyexp_128_aes:
+ .set rcon,1
+ .set off,0
+ .rept 10
+ round_128 off,rcon
+ .set off,off+1
+ .set rcon,(rcon << 1) ^ ((rcon >> 7) * 0x11b)
+ .endr
+
+ export_dec_key NUM_ROUNDS(128),exp_key_enc,exp_key_dec
+ ret
+ .size aes_keyexp_128_aes, .-aes_keyexp_128_aes
+ .global aes_keyexp_128_enc_aes
+ .type aes_keyexp_128_enc_aes, %function
+aes_keyexp_128_enc_aes:
+ .set rcon,1
+ .set off,0
+ .rept 10
+ round_128 off,rcon
+ .set off,off+1
+ .set rcon,(rcon << 1) ^ ((rcon >> 7) * 0x11b)
+ .endr
+ ret
+ .size aes_keyexp_128_enc_aes, .-aes_keyexp_128_enc_aes \ No newline at end of file
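The `.set rcon,(rcon << 1) ^ ((rcon >> 7) * 0x11b)` step repeated above is the standard AES round-constant recurrence: doubling in GF(2^8) with reduction by the AES polynomial 0x11b, which yields 0x01, 0x02, 0x04, ..., 0x80, 0x1b, 0x36 across the ten AES-128 rounds. A small standalone C sketch of the same recurrence (illustrative only, not part of this patch):

    #include <stdio.h>

    int main(void)
    {
            unsigned rcon = 1;
            for (int i = 0; i < 10; i++) {
                    printf("round %2d: rcon = 0x%02x\n", i + 1, rcon);
                    /* xtime in GF(2^8): double, then reduce by x^8+x^4+x^3+x+1 */
                    rcon = (rcon << 1) ^ ((rcon >> 7) * 0x11b);
            }
            return 0;
    }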
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_192_aarch64_aes.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_192_aarch64_aes.S
new file mode 100644
index 000000000..2ba46060c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_192_aarch64_aes.S
@@ -0,0 +1,136 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+/*
+Macros
+*/
+#define NUM_ROUNDS(a) (7+(a)/32)
+.macro declare_var_vector_reg name:req,reg:req
+ q\name .req q\reg
+ v\name .req v\reg
+ s\name .req s\reg
+.endm
+.macro round_192 off:req,rcon:req
+ .if \off == 0
+ ldp w_tmp0,w_tmp1,[key]
+ ldp w_tmp2,w_tmp3,[key,8]
+ ldp w_tmp4,w_tmp5,[key,16]
+ movi vzero.4s,0
+ dup vsrc.4s,w_tmp5
+ stp w_tmp0,w_tmp1,[exp_key_enc]
+ stp w_tmp4,w_tmp5,[exp_key_enc,16]
+ stp w_tmp2,w_tmp3,[exp_key_enc,8]
+ .endif
+ mov w0,\rcon
+ mov vdest.16b,vzero.16b
+ aese vdest.16b,vsrc.16b
+ mov w_tmp,vdest.s[0]
+ eor w_tmp0,w_tmp0,w0
+ eor w_tmp0,w_tmp0,w_tmp,ror 8
+ eor w_tmp1,w_tmp0,w_tmp1
+ eor w_tmp2,w_tmp1,w_tmp2
+ eor w_tmp3,w_tmp2,w_tmp3
+ .if \off < 7
+ eor w_tmp4,w_tmp4,w_tmp3
+ eor w_tmp5,w_tmp5,w_tmp4
+ dup vsrc.4s,w_tmp5
+ stp w_tmp0,w_tmp1,[exp_key_enc,KEY_LEN*(\off+1)]
+ stp w_tmp2,w_tmp3,[exp_key_enc,KEY_LEN*(\off+1)+8]
+ stp w_tmp4,w_tmp5,[exp_key_enc,KEY_LEN*(\off+1)+16]
+ .else
+ stp w_tmp0,w_tmp1,[exp_key_enc,KEY_LEN*(\off+1)]
+ stp w_tmp2,w_tmp3,[exp_key_enc,KEY_LEN*(\off+1)+8]
+ .endif
+.endm
+
+.macro export_dec_key rounds:req,enc_key:req,dec_key:req
+ ldr q0,[\enc_key]
+ ldr q1,[\enc_key,(\rounds-1)*16]
+ str q0,[\dec_key,(\rounds-1)*16]
+ str q1,[\dec_key]
+ ldp q0,q1,[\enc_key,1*16]
+ ldp q2,q3,[\enc_key,(1+2)*16]
+ ldp q4,q5,[\enc_key,(1+4)*16]
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ ldp q6,q7,[\enc_key,(1+6)*16]
+ aesimc v2.16b,v2.16b
+ aesimc v3.16b,v3.16b
+ stp q1,q0,[\dec_key,(\rounds-1-2)*16]
+ ldp q0,q1,[\enc_key,(1+8)*16]
+ aesimc v4.16b,v4.16b
+ aesimc v5.16b,v5.16b
+ stp q3,q2,[\dec_key,(\rounds-1-4)*16]
+ aesimc v6.16b,v6.16b
+ aesimc v7.16b,v7.16b
+ stp q5,q4,[\dec_key,(\rounds-1-6)*16]
+ ldr q2,[\enc_key,(1+10)*16]
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ stp q7,q6,[\dec_key,(\rounds-1-8)*16]
+ aesimc v2.16b,v2.16b
+ stp q1,q0,[\dec_key,(\rounds-1-10)*16]
+ str q2,[\dec_key,(\rounds-1-11)*16]
+.endm
+/**
+ void aes_keyexp_192_aes(const uint8_t * key,
+ uint8_t * exp_key_enc, uint8_t * exp_key_dec)
+*/
+ key .req x0
+ exp_key_enc .req x1
+ exp_key_dec .req x2
+ .equ KEY_LEN, (192/8)
+ w_tmp0 .req w3
+ w_tmp1 .req w4
+ w_tmp2 .req w5
+ w_tmp3 .req w6
+ w_tmp .req w7
+ w_tmp4 .req w9
+ w_tmp5 .req w10
+ declare_var_vector_reg dest,0
+ declare_var_vector_reg zero,1
+ declare_var_vector_reg src, 2
+
+
+ .global aes_keyexp_192_aes
+ .type aes_keyexp_192_aes, %function
+
+aes_keyexp_192_aes:
+ .set rcon,1
+ .set off,0
+ .rept 8
+ round_192 off,rcon
+ .set off,off+1
+ .set rcon,(rcon << 1) ^ ((rcon >> 7) * 0x11b)
+ .endr
+ export_dec_key NUM_ROUNDS(192),exp_key_enc,exp_key_dec
+ ret
+ .size aes_keyexp_192_aes, .-aes_keyexp_192_aes
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_256_aarch64_aes.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_256_aarch64_aes.S
new file mode 100644
index 000000000..5433b2ff6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_256_aarch64_aes.S
@@ -0,0 +1,153 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+
+ .text
+/*
+Macros
+*/
+#define NUM_ROUNDS(a) (7+(a)/32)
+.macro declare_var_vector_reg name:req,reg:req
+ q\name .req q\reg
+ v\name .req v\reg
+ s\name .req s\reg
+.endm
+.macro round_256 off:req,rcon:req,export_dec_key
+ .if \off == 0
+ ldp w_tmp6,w_tmp7,[key,24]
+ ldp w_tmp0,w_tmp1,[key]
+ ldp w_tmp2,w_tmp3,[key,8]
+ ldp w_tmp4,w_tmp5,[key,16]
+ movi vzero.4s,0
+ dup vsrc.4s,w_tmp7
+ stp w_tmp6,w_tmp7,[exp_key_enc,24]
+ stp w_tmp0,w_tmp1,[exp_key_enc]
+ stp w_tmp4,w_tmp5,[exp_key_enc,16]
+ stp w_tmp2,w_tmp3,[exp_key_enc,8]
+ .endif
+ mov w0,\rcon
+ mov vdest.16b,vzero.16b
+ aese vdest.16b,vsrc.16b
+ mov w_tmp,vdest.s[0]
+ eor w_tmp0,w_tmp0,w0
+ eor w_tmp0,w_tmp0,w_tmp,ror 8
+ eor w_tmp1,w_tmp0,w_tmp1
+ eor w_tmp2,w_tmp1,w_tmp2
+ eor w_tmp3,w_tmp2,w_tmp3
+ .if \off < 6
+ dup vsrc.4s,w_tmp3
+ mov vdest.16b,vzero.16b
+ aese vdest.16b,vsrc.16b
+ mov w_tmp,vdest.s[0]
+ eor w_tmp4,w_tmp4,w_tmp
+ eor w_tmp5,w_tmp5,w_tmp4
+ eor w_tmp6,w_tmp6,w_tmp5
+ eor w_tmp7,w_tmp7,w_tmp6
+ dup vsrc.4s,w_tmp7
+ stp w_tmp0,w_tmp1,[exp_key_enc,KEY_LEN*(\off+1)]
+ stp w_tmp2,w_tmp3,[exp_key_enc,KEY_LEN*(\off+1)+8]
+ stp w_tmp4,w_tmp5,[exp_key_enc,KEY_LEN*(\off+1)+16]
+ stp w_tmp6,w_tmp7,[exp_key_enc,KEY_LEN*(\off+1)+24]
+ .else
+ stp w_tmp0,w_tmp1,[exp_key_enc,KEY_LEN*(\off+1)]
+ stp w_tmp2,w_tmp3,[exp_key_enc,KEY_LEN*(\off+1)+8]
+ .endif
+.endm
+
+.macro export_dec_key rounds:req,enc_key:req,dec_key:req
+ ldr q0,[\enc_key]
+ ldr q1,[\enc_key,(\rounds-1)*16]
+ str q0,[\dec_key,(\rounds-1)*16]
+ str q1,[\dec_key]
+ ldp q0,q1,[\enc_key,1*16]
+ ldp q2,q3,[\enc_key,(1+2)*16]
+ ldp q4,q5,[\enc_key,(1+4)*16]
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ ldp q6,q7,[\enc_key,(1+6)*16]
+ aesimc v2.16b,v2.16b
+ aesimc v3.16b,v3.16b
+ stp q1,q0,[\dec_key,(\rounds-1-2)*16]
+ ldp q0,q1,[\enc_key,(1+8)*16]
+ aesimc v4.16b,v4.16b
+ aesimc v5.16b,v5.16b
+ stp q3,q2,[\dec_key,(\rounds-1-4)*16]
+ ldp q2,q3,[\enc_key,(1+10)*16]
+
+ aesimc v6.16b,v6.16b
+ aesimc v7.16b,v7.16b
+ stp q5,q4,[\dec_key,(\rounds-1-6)*16]
+ ldr q4,[\enc_key,(1+12)*16]
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ stp q7,q6,[\dec_key,(\rounds-1-8)*16]
+ aesimc v2.16b,v2.16b
+ aesimc v3.16b,v3.16b
+ stp q1,q0,[\dec_key,(\rounds-1-10)*16]
+ aesimc v4.16b,v4.16b
+ stp q3,q2,[\dec_key,(\rounds-1-12)*16]
+ str q4,[\dec_key,(\rounds-1-13)*16]
+.endm
+/**
+ void aes_keyexp_256_aes(const uint8_t * key,
+ uint8_t * exp_key_enc, uint8_t * exp_key_dec)
+*/
+ key .req x0
+ exp_key_enc .req x1
+ exp_key_dec .req x2
+ .equ KEY_LEN, (256/8)
+ w_tmp0 .req w3
+ w_tmp1 .req w4
+ w_tmp2 .req w5
+ w_tmp3 .req w6
+ w_tmp .req w7
+ w_tmp4 .req w9
+ w_tmp5 .req w10
+ w_tmp6 .req w11
+ w_tmp7 .req w12
+ declare_var_vector_reg dest,0
+ declare_var_vector_reg zero,1
+ declare_var_vector_reg src, 2
+
+
+ .global aes_keyexp_256_aes
+ .type aes_keyexp_256_aes, %function
+
+aes_keyexp_256_aes:
+ .set rcon,1
+ .set off,0
+ .rept 7
+ round_256 off,rcon,1
+ .set off,off+1
+ .set rcon,(rcon << 1) ^ ((rcon >> 7) * 0x11b)
+ .endr
+ export_dec_key NUM_ROUNDS(256),exp_key_enc,exp_key_dec
+ ret
+ .size aes_keyexp_256_aes, .-aes_keyexp_256_aes
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_aarch64_dispatcher.c
new file mode 100644
index 000000000..14c9889ac
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_aarch64_dispatcher.c
@@ -0,0 +1,72 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+#undef PROVIDER_BASIC
+#define PROVIDER_BASIC(a) (void*)0
+
+DEFINE_INTERFACE_DISPATCHER(aes_keyexp_128)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES))
+ return PROVIDER_INFO(aes_keyexp_128_aes);
+
+ return PROVIDER_BASIC(aes_keyexp_128);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_keyexp_128_enc)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES))
+ return PROVIDER_INFO(aes_keyexp_128_enc_aes);
+
+ return PROVIDER_BASIC(aes_keyexp_128_enc);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_keyexp_192)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES))
+ return PROVIDER_INFO(aes_keyexp_192_aes);
+
+ return PROVIDER_BASIC(aes_keyexp_192);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_keyexp_256)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES))
+ return PROVIDER_INFO(aes_keyexp_256_aes);
+
+ return PROVIDER_BASIC(aes_keyexp_256);
+
+}
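Each dispatcher above resolves its interface once at load time: it reads the ELF auxiliary vector and selects the Crypto-Extension implementation only when both HWCAP_ASIMD and HWCAP_AES are advertised, otherwise it falls back to PROVIDER_BASIC (stubbed to NULL here). The same capability test in isolation, as a minimal standalone C sketch for Linux/aarch64 (the XTS dispatcher later in this patch factors it out the same way as is_crypto_available):

    #include <sys/auxv.h>       /* getauxval, AT_HWCAP */
    #include <asm/hwcap.h>      /* HWCAP_ASIMD, HWCAP_AES on aarch64 Linux */

    /* Return non-zero when the ASIMD and AES Crypto Extensions may be used. */
    static int have_aes_ce(void)
    {
            unsigned long hwcap = getauxval(AT_HWCAP);
            return (hwcap & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES);
    }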
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_multibinary_aarch64.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_multibinary_aarch64.S
new file mode 100644
index 000000000..aa7c32576
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_multibinary_aarch64.S
@@ -0,0 +1,35 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aarch64_multibinary.h"
+
+mbin_interface aes_keyexp_128
+mbin_interface aes_keyexp_128_enc
+mbin_interface aes_keyexp_192
+mbin_interface aes_keyexp_256
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aarch64_dispatcher.c
new file mode 100644
index 000000000..6c918858e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aarch64_dispatcher.c
@@ -0,0 +1,102 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+#undef PROVIDER_BASIC
+#define PROVIDER_BASIC(a) (void*)0
+
+static unsigned long is_crypto_available(void)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ return (auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES);
+}
+
+DEFINE_INTERFACE_DISPATCHER(XTS_AES_128_enc)
+{
+ if (is_crypto_available()) {
+ return PROVIDER_INFO(XTS_AES_128_enc_ce);
+ }
+ return PROVIDER_BASIC(XTS_AES_128_enc);
+}
+
+DEFINE_INTERFACE_DISPATCHER(XTS_AES_128_dec)
+{
+ if (is_crypto_available()) {
+ return PROVIDER_INFO(XTS_AES_128_dec_ce);
+ }
+ return PROVIDER_BASIC(XTS_AES_128_dec);
+}
+
+DEFINE_INTERFACE_DISPATCHER(XTS_AES_128_enc_expanded_key)
+{
+ if (is_crypto_available()) {
+ return PROVIDER_INFO(XTS_AES_128_enc_expanded_key_ce);
+ }
+ return PROVIDER_BASIC(XTS_AES_128_enc_expanded_key);
+}
+
+DEFINE_INTERFACE_DISPATCHER(XTS_AES_128_dec_expanded_key)
+{
+ if (is_crypto_available()) {
+ return PROVIDER_INFO(XTS_AES_128_dec_expanded_key_ce);
+ }
+ return PROVIDER_BASIC(XTS_AES_128_dec_expanded_key);
+}
+
+DEFINE_INTERFACE_DISPATCHER(XTS_AES_256_enc)
+{
+ if (is_crypto_available()) {
+ return PROVIDER_INFO(XTS_AES_256_enc_ce);
+ }
+ return PROVIDER_BASIC(XTS_AES_256_enc);
+}
+
+DEFINE_INTERFACE_DISPATCHER(XTS_AES_256_dec)
+{
+ if (is_crypto_available()) {
+ return PROVIDER_INFO(XTS_AES_256_dec_ce);
+ }
+ return PROVIDER_BASIC(XTS_AES_256_dec);
+}
+
+DEFINE_INTERFACE_DISPATCHER(XTS_AES_256_enc_expanded_key)
+{
+ if (is_crypto_available()) {
+ return PROVIDER_INFO(XTS_AES_256_enc_expanded_key_ce);
+ }
+ return PROVIDER_BASIC(XTS_AES_256_enc_expanded_key);
+}
+
+DEFINE_INTERFACE_DISPATCHER(XTS_AES_256_dec_expanded_key)
+{
+ if (is_crypto_available()) {
+ return PROVIDER_INFO(XTS_AES_256_dec_expanded_key_ce);
+ }
+ return PROVIDER_BASIC(XTS_AES_256_dec_expanded_key);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_common.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_common.S
new file mode 100644
index 000000000..318c1e8a4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_common.S
@@ -0,0 +1,214 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+.altmacro
+.macro aes_key_expand_next next:req,prev:req,ctx:req
+ .if \next == 9
+ mov \ctx, 0x1b
+ .endif
+ dup vdest.4s,vKey\prev\().s[3]
+ ext vtmp.16b,vzero.16b,vKey\prev\().16b,#12
+ aese vdest.16b,vzero.16b
+ eor vKey\next\().16b,vKey\prev\().16b,vtmp.16b
+ ext vtmp.16b,vzero.16b,vtmp.16b,#12
+ eor vKey\next\().16b,vKey\next\().16b,vtmp.16b
+ ext vtmp.16b,vzero.16b,vtmp.16b,#12
+ mov tmpw,vdest.s[0]
+ eor tmpw,\ctx,tmpw,ror 8
+ dup vdest.4s,tmpw
+ eor vKey\next\().16b,vKey\next\().16b,vtmp.16b
+ mov \ctx,ctx,lsl 1
+ eor vKey\next\().16b,vKey\next\().16b,vdest.16b
+.endm
+
+/* when loading key = 0
+ * arg1 = input key
+ * arg2 = rcon ctx register (optional)
+ * when loading key > 0
+ * arg1 = rcon ctx register (optional)
+ */
+.macro aes_key_expand key:req,arg1,arg2
+ .if \key == 0
+ ld1 {vKey\key\().4s},[\arg1]
+ movi vzero.4s, 0
+ .ifb \arg2
+ mov rcon,#0x01
+ .endif
+
+ .ifnb \arg2
+ mov \arg2,#0x01
+ .endif
+ .endif
+
+ .if \key > 0
+ prev=\key-1
+ .ifb \arg1
+ aes_key_expand_next \key,%prev,rcon
+ .endif
+
+ .ifnb \arg1
+ aes_key_expand_next \key,%prev,\arg1
+ .endif
+ .endif
+.endm
+
+.macro aes_round block:req,key:req,mode:req
+ .if \key < 9
+ .if mode == 0
+ aese \block\().16b,vKey\key\().16b
+ aesmc \block\().16b,\block\().16b
+ .else
+ aesd \block\().16b,vKey\key\().16b
+ aesimc \block\().16b,\block\().16b
+ .endif
+ .endif
+ .if \key == 9
+ .if mode == 0
+ aese \block\().16b,vKey\key\().16b
+ .else
+ aesd \block\().16b,vKey\key\().16b
+ .endif
+ .endif
+ .if \key == 10
+ eor \block\().16b,\block\().16b,vKey\key\().16b
+ .endif
+.endm
+
+.macro aes_round_interleave b0:req,b1:req,b2:req,b3:req,key:req,mode:req,last_key
+ .if \key < 9
+ .if \mode == 0
+ aese \b0\().16b,vKey\key\().16b
+ aesmc \b0\().16b,\b0\().16b
+ aese \b1\().16b,vKey\key\().16b
+ aesmc \b1\().16b,\b1\().16b
+ aese \b2\().16b,vKey\key\().16b
+ aesmc \b2\().16b,\b2\().16b
+ aese \b3\().16b,vKey\key\().16b
+ aesmc \b3\().16b,\b3\().16b
+ .else
+ aesd \b0\().16b,vKey\key\().16b
+ aesimc \b0\().16b,\b0\().16b
+ aesd \b1\().16b,vKey\key\().16b
+ aesimc \b1\().16b,\b1\().16b
+ aesd \b2\().16b,vKey\key\().16b
+ aesimc \b2\().16b,\b2\().16b
+ aesd \b3\().16b,vKey\key\().16b
+ aesimc \b3\().16b,\b3\().16b
+ .endif
+ .endif
+
+ .if \key == 9
+ .if \mode == 0
+ aese \b0\().16b,vKey\key\().16b
+ eor \b0\().16b,\b0\().16b,vKey\last_key\().16b
+ aese \b1\().16b,vKey\key\().16b
+ eor \b1\().16b,\b1\().16b,vKey\last_key\().16b
+ aese \b2\().16b,vKey\key\().16b
+ eor \b2\().16b,\b2\().16b,vKey\last_key\().16b
+ aese \b3\().16b,vKey\key\().16b
+ eor \b3\().16b,\b3\().16b,vKey\last_key\().16b
+ .else
+ aesd \b0\().16b,vKey\key\().16b
+ eor \b0\().16b,\b0\().16b,vKey\last_key\().16b
+ aesd \b1\().16b,vKey\key\().16b
+ eor \b1\().16b,\b1\().16b,vKey\last_key\().16b
+ aesd \b2\().16b,vKey\key\().16b
+ eor \b2\().16b,\b2\().16b,vKey\last_key\().16b
+ aesd \b3\().16b,vKey\key\().16b
+ eor \b3\().16b,\b3\().16b,vKey\last_key\().16b
+ .endif
+ .endif
+.endm
+
+.macro aes_rounds_interleave b0:req,b1:req,b2:req,b3:req,mode
+ aes_round_interleave \b0,\b1,\b2,\b3,0,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,1,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,2,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,3,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,4,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,5,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,6,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,7,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,8,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,9,\mode,10
+.endm
+
+.macro aes_rounds blk:req,mode:req
+ aes_round \blk,0,\mode
+ aes_round \blk,1,\mode
+ aes_round \blk,2,\mode
+ aes_round \blk,3,\mode
+ aes_round \blk,4,\mode
+ aes_round \blk,5,\mode
+ aes_round \blk,6,\mode
+ aes_round \blk,7,\mode
+ aes_round \blk,8,\mode
+ aes_round \blk,9,\mode
+ aes_round \blk,10,\mode
+.endm
+
+/* load k1/k2 from memory and encrypt the tweak by k2
+ * both keys will share the same set of registers
+ * but will never overlap (k2 is used only once and discarded)
+ */
+.macro keyload_and_encrypt_tweak iv:req,k2:req,k1:req
+ ldp qKey0,qKey1,[\k2],#32
+ aes_enc_round \iv,0
+ ldp qKey2,qKey3,[\k2],#32
+ aes_enc_round \iv,1
+ ldp qKey0,qKey1,[\k1],#32
+ aes_enc_round \iv,2
+ ldp qKey4,qKey5,[\k2],#32
+ aes_enc_round \iv,3
+ ldp qKey2,qKey3,[\k1],#32
+ aes_enc_round \iv,4
+ ldp qKey6,qKey7,[\k2],#32
+ aes_enc_round \iv,5
+ ldp qKey4,qKey5,[\k1],#32
+ aes_enc_round \iv,6
+ ldp qKey8,qKey9,[k2],#32
+ aes_enc_round \iv,7
+ ldp qKey6,qKey7,[\k1],#32
+ aes_enc_round \iv,8
+ ld1 {vKey10.16b},[\k2],#16
+ aes_enc_round \iv,9
+ ldp qKey8,qKey9,[\k1],#32
+ aes_enc_round \iv,10
+ ld1 {vKey10.16b},[\k1],#16
+.endm
+
+.macro save_stack
+ stp d8,d9,[sp, -32]!
+ add tmpbuf,sp,16
+.endm
+
+.macro restore_stack
+ ldp d8,d9,[sp],32
+.endm
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_dec.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_dec.S
new file mode 100644
index 000000000..ceae2d3c0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_dec.S
@@ -0,0 +1,116 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+
+#include "xts_aes_128_common.S"
+#include "xts_aes_common.S"
+
+.macro vswap vec1:req,vec2:req
+ mov vtmp.16b,\vec1\().16b
+ mov \vec1\().16b,\vec2\().16b
+ mov \vec2\().16b,vtmp.16b
+.endm
+
+/* encrypt the tweak with the tweak key (k2) and, at the same time,
+ * expand the encryption key (k1);
+ * even though the two key sets share the same registers,
+ * they never overlap at any given time (k2 is used only once and discarded)
+ */
+.macro keyexp_and_encrypt_tweak iv:req,k2:req,k1:req
+ aes_key_expand 0,\k2
+ aes_enc_round \iv,0
+ aes_key_expand 1
+ aes_enc_round \iv,1
+ aes_key_expand 0,\k1,rcon2
+ aes_key_expand 2
+ aes_enc_round \iv,2
+ aes_key_expand 1,rcon2
+ aes_key_expand 3
+ aes_enc_round \iv,3
+ aes_key_expand 2,rcon2
+ aes_key_expand 4
+ aes_enc_round \iv,4
+ aes_key_expand 3,rcon2
+ aes_key_expand 5
+ aes_enc_round \iv,5
+ aes_key_expand 4,rcon2
+ aes_key_expand 6
+ aes_enc_round \iv,6
+ aes_key_expand 5,rcon2
+ aes_key_expand 7
+ aes_enc_round \iv,7
+ aes_key_expand 6,rcon2
+ aes_key_expand 8
+ aes_enc_round \iv,8
+ aes_key_expand 7,rcon2
+ aes_key_expand 9
+ aes_enc_round \iv,9
+ aes_key_expand 8,rcon2
+ aes_key_expand 10
+ aes_enc_round \iv,10
+ aes_key_expand 9,rcon2
+ aes_key_expand 10,rcon2
+
+	// transform encryption key into decryption key
+ aesimc vKey1.16b,vKey1.16b
+ vswap vKey0,vKey10
+ aesimc vKey9.16b,vKey9.16b
+
+ aesimc vKey2.16b,vKey2.16b
+ aesimc vKey8.16b,vKey8.16b
+ vswap vKey1,vKey9
+
+ aesimc vKey3.16b,vKey3.16b
+ aesimc vKey7.16b,vKey7.16b
+ vswap vKey2,vKey8
+
+ aesimc vKey4.16b,vKey4.16b
+ aesimc vKey6.16b,vKey6.16b
+ vswap vKey3,vKey7
+
+ aesimc vKey5.16b,vKey5.16b
+ vswap vKey4,vKey6
+.endm
+
+/*
+ * void XTS_AES_128_dec_ce(
+ * uint8_t *k2, //!< key used for tweaking, 16 bytes
+ * uint8_t *k1, //!< key used for decryption of tweaked ciphertext, 16 bytes
+ * uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ * uint64_t N, //!< sector size, in bytes
+ * const uint8_t *ct, //!< ciphertext sector input data
+ * uint8_t *pt //!< plaintext sector output data
+ * );
+*/
+ .global XTS_AES_128_dec_ce
+ .type XTS_AES_128_dec_ce, %function
+XTS_AES_128_dec_ce:
+ xts_aes_crypt 1,keyexp_and_encrypt_tweak vIV0,key2,key1
+ .size XTS_AES_128_dec_ce, .-XTS_AES_128_dec_ce
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_enc.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_enc.S
new file mode 100644
index 000000000..23ed14a38
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_enc.S
@@ -0,0 +1,91 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+
+#include "xts_aes_128_common.S"
+#include "xts_aes_common.S"
+
+/* encrypt the tweak with the tweak key (k2) and, at the same time,
+ * expand the encryption key (k1);
+ * even though the two key sets share the same registers,
+ * they never overlap at any given time (k2 is used once and discarded)
+ */
+.macro keyexp_and_encrypt_tweak iv:req,k2:req,k1:req
+ aes_key_expand 0,\k2
+ aes_enc_round \iv,0
+ aes_key_expand 1
+ aes_enc_round \iv,1
+ aes_key_expand 0,\k1,rcon2
+ aes_key_expand 2
+ aes_enc_round \iv,2
+ aes_key_expand 1,rcon2
+ aes_key_expand 3
+ aes_enc_round \iv,3
+ aes_key_expand 2,rcon2
+ aes_key_expand 4
+ aes_enc_round \iv,4
+ aes_key_expand 3,rcon2
+ aes_key_expand 5
+ aes_enc_round \iv,5
+ aes_key_expand 4,rcon2
+ aes_key_expand 6
+ aes_enc_round \iv,6
+ aes_key_expand 5,rcon2
+ aes_key_expand 7
+ aes_enc_round \iv,7
+ aes_key_expand 6,rcon2
+ aes_key_expand 8
+ aes_enc_round \iv,8
+ aes_key_expand 7,rcon2
+ aes_key_expand 9
+ aes_enc_round \iv,9
+ aes_key_expand 8,rcon2
+ aes_key_expand 10
+ aes_enc_round \iv,10
+ aes_key_expand 9,rcon2
+ aes_key_expand 10,rcon2
+.endm
+
+
+/*
+ * void XTS_AES_128_enc_ce(
+ * uint8_t *k2, //!< key used for tweaking, 16 bytes
+ *	uint8_t *k1,	//!< key used for encryption of tweaked plaintext, 16 bytes
+ * uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ * uint64_t N, //!< sector size, in bytes
+ * const uint8_t *pt, //!< cleartext sector input data
+ * uint8_t *ct //!< ciphertext sector output data
+ * );
+ */
+ .global XTS_AES_128_enc_ce
+ .type XTS_AES_128_enc_ce, %function
+XTS_AES_128_enc_ce:
+ xts_aes_crypt 0,keyexp_and_encrypt_tweak vIV0,key2,key1
+ .size XTS_AES_128_enc_ce, .-XTS_AES_128_enc_ce
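Unlike the expanded-key variants elsewhere in this series, XTS_AES_128_enc_ce takes the raw 16-byte k1/k2 and expands them on the fly while the initial tweak is being encrypted, which is why keyexp_and_encrypt_tweak interleaves key-schedule and aese steps. A hedged usage sketch built only from the prototype in the comment above (callers normally go through the dispatched XTS_AES_128_enc name rather than the _ce symbol; the wrapper and buffer names below are illustrative):

    #include <stdint.h>

    /* Prototype as documented in the comment above. */
    void XTS_AES_128_enc_ce(uint8_t *k2, uint8_t *k1, uint8_t *TW_initial,
                            uint64_t N, const uint8_t *pt, uint8_t *ct);

    /* Encrypt one whole sector of sector_bytes with AES-128-XTS. */
    static void encrypt_sector(uint8_t k2[16], uint8_t k1[16],
                               uint8_t tweak[16],      /* e.g. sector number */
                               const uint8_t *sector, uint8_t *out,
                               uint64_t sector_bytes)
    {
            XTS_AES_128_enc_ce(k2, k1, tweak, sector_bytes, sector, out);
    }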
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_common.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_common.S
new file mode 100644
index 000000000..e6535dba3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_common.S
@@ -0,0 +1,247 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+.altmacro
+.macro aes_key_expand_next out0:req,out1:req,in0:req,in1:req,ctx:req
+ dup vdest.4s,vKey\in1\().s[3]
+ ext vtmp.16b,vzero.16b,vKey\in0\().16b,#12
+ aese vdest.16b,vzero.16b
+ eor vKey\out0\().16b,vKey\in0\().16b,vtmp.16b
+ ext vtmp.16b,vzero.16b,vtmp.16b,#12
+ eor vKey\out0\().16b,vKey\out0\().16b,vtmp.16b
+ ext vtmp.16b,vzero.16b,vtmp.16b,#12
+ mov tmpw,vdest.s[0]
+ eor tmpw,\ctx,tmpw,ror 8
+ dup vdest.4s,tmpw
+ eor vKey\out0\().16b,vKey\out0\().16b,vtmp.16b
+ mov \ctx,ctx,lsl 1
+ eor vKey\out0\().16b,vKey\out0\().16b,vdest.16b
+
+ .if \out1 < 14
+ dup vdest.4s, vKey\out0\().s[3]
+ ext vtmp.16b, vzero.16b,vKey\in1\().16b,#12
+ aese vdest.16b,vzero.16b
+ eor vKey\out1\().16b,vKey\in1\().16b,vtmp.16b
+ ext vtmp.16b,vzero.16b,vtmp.16b,#12
+ eor vKey\out1\().16b,vKey\out1\().16b,vtmp.16b
+ ext vtmp.16b,vzero.16b,vtmp.16b,#12
+ eor vKey\out1\().16b,vKey\out1\().16b,vtmp.16b
+ eor vKey\out1\().16b,vKey\out1\().16b,vdest.16b
+ .endif
+.endm
+
+/* when loading key = 0
+ * arg1 = input key
+ * arg2 = rcon ctx register (optional)
+ * when loading key > 0
+ * arg1 = rcon ctx register (optional)
+ */
+.macro aes_key_expand key:req,arg1,arg2
+ .if \key == 0
+ ld1 {vKey0.4s,vKey1.4s},[\arg1]
+ movi vzero.4s, 0
+ .ifb \arg2
+ mov rcon,#0x01
+ .endif
+
+ .ifnb \arg2
+ mov \arg2,#0x01
+ .endif
+ .endif
+
+ .if \key > 0
+ in0=\key-2
+ in1=\key-1
+ out0=\key
+ out1=\key+1
+ .ifb \arg1
+ aes_key_expand_next %out0,%out1,%in0,%in1,rcon
+ .endif
+
+ .ifnb \arg1
+ aes_key_expand_next %out0,%out1,%in0,%in1,\arg1
+ .endif
+ .endif
+.endm
+
+.macro aes_round block:req,key:req,mode:req
+ .if \key < 13
+ .if mode == 0
+ aese \block\().16b,vKey\key\().16b
+ aesmc \block\().16b,\block\().16b
+ .else
+ aesd \block\().16b,vKey\key\().16b
+ aesimc \block\().16b,\block\().16b
+ .endif
+ .endif
+ .if \key == 13
+ .if mode == 0
+ aese \block\().16b,vKey\key\().16b
+ .else
+ aesd \block\().16b,vKey\key\().16b
+ .endif
+ .endif
+ .if \key == 14
+ eor \block\().16b,\block\().16b,vKey\key\().16b
+ .endif
+.endm
+
+.macro aes_round_interleave b0:req,b1:req,b2:req,b3:req,key:req,mode:req,last_key
+ .if \key < 13
+ .if \mode == 0
+ aese \b0\().16b,vKey\key\().16b
+ aesmc \b0\().16b,\b0\().16b
+ aese \b1\().16b,vKey\key\().16b
+ aesmc \b1\().16b,\b1\().16b
+ aese \b2\().16b,vKey\key\().16b
+ aesmc \b2\().16b,\b2\().16b
+ aese \b3\().16b,vKey\key\().16b
+ aesmc \b3\().16b,\b3\().16b
+ .else
+ aesd \b0\().16b,vKey\key\().16b
+ aesimc \b0\().16b,\b0\().16b
+ aesd \b1\().16b,vKey\key\().16b
+ aesimc \b1\().16b,\b1\().16b
+ aesd \b2\().16b,vKey\key\().16b
+ aesimc \b2\().16b,\b2\().16b
+ aesd \b3\().16b,vKey\key\().16b
+ aesimc \b3\().16b,\b3\().16b
+ .endif
+ .endif
+
+ .if \key == 13
+ .if \mode == 0
+ aese \b0\().16b,vKey\key\().16b
+ eor \b0\().16b,\b0\().16b,vKey\last_key\().16b
+ aese \b1\().16b,vKey\key\().16b
+ eor \b1\().16b,\b1\().16b,vKey\last_key\().16b
+ aese \b2\().16b,vKey\key\().16b
+ eor \b2\().16b,\b2\().16b,vKey\last_key\().16b
+ aese \b3\().16b,vKey\key\().16b
+ eor \b3\().16b,\b3\().16b,vKey\last_key\().16b
+ .else
+ aesd \b0\().16b,vKey\key\().16b
+ eor \b0\().16b,\b0\().16b,vKey\last_key\().16b
+ aesd \b1\().16b,vKey\key\().16b
+ eor \b1\().16b,\b1\().16b,vKey\last_key\().16b
+ aesd \b2\().16b,vKey\key\().16b
+ eor \b2\().16b,\b2\().16b,vKey\last_key\().16b
+ aesd \b3\().16b,vKey\key\().16b
+ eor \b3\().16b,\b3\().16b,vKey\last_key\().16b
+ .endif
+ .endif
+.endm
+
+
+
+.macro aes_rounds_interleave b0:req,b1:req,b2:req,b3:req,mode
+ aes_round_interleave \b0,\b1,\b2,\b3,0,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,1,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,2,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,3,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,4,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,5,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,6,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,7,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,8,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,9,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,10,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,11,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,12,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,13,\mode,14
+.endm
+
+
+.macro aes_rounds blk:req,mode:req
+ aes_round \blk,0,\mode
+ aes_round \blk,1,\mode
+ aes_round \blk,2,\mode
+ aes_round \blk,3,\mode
+ aes_round \blk,4,\mode
+ aes_round \blk,5,\mode
+ aes_round \blk,6,\mode
+ aes_round \blk,7,\mode
+ aes_round \blk,8,\mode
+ aes_round \blk,9,\mode
+ aes_round \blk,10,\mode
+ aes_round \blk,11,\mode
+ aes_round \blk,12,\mode
+ aes_round \blk,13,\mode
+ aes_round \blk,14,\mode
+.endm
+
+/* load k1/k2 from memory and encrypt the tweak by k2
+ * both keys share the same set of registers
+ * but will never overlap (k2 is used only once and discarded)
+ */
+.macro keyload_and_encrypt_tweak iv:req,k2:req,k1:req
+ ldp qKey0,qKey1,[\k2],#32
+ aes_enc_round \iv,0
+ ldp qKey2,qKey3,[\k2],#32
+ aes_enc_round \iv,1
+ ldp qKey0,qKey1,[\k1],#32
+ aes_enc_round \iv,2
+ ldp qKey4,qKey5,[\k2],#32
+ aes_enc_round \iv,3
+ ldp qKey2,qKey3,[\k1],#32
+ aes_enc_round \iv,4
+ ldp qKey6,qKey7,[\k2],#32
+ aes_enc_round \iv,5
+ ldp qKey4,qKey5,[\k1],#32
+ aes_enc_round \iv,6
+	ldp	qKey8,qKey9,[\k2],#32
+ aes_enc_round \iv,7
+ ldp qKey6,qKey7,[\k1],#32
+ aes_enc_round \iv,8
+	ldp	qKey10,qKey11,[\k2],#32
+ aes_enc_round \iv,9
+ ldp qKey8,qKey9,[\k1],#32
+ aes_enc_round \iv,10
+	ldp	qKey12,qKey13,[\k2],#32
+ aes_enc_round \iv,11
+ ldp qKey10,qKey11,[\k1],#32
+ aes_enc_round \iv,12
+	ld1	{vKey14.16b},[\k2],#16
+ aes_enc_round \iv,13
+ ldp qKey12,qKey13,[\k1],#32
+ aes_enc_round \iv,14
+ ld1 {vKey14.16b},[\k1],#16
+.endm
+
+.macro save_stack
+ stp d8,d9,[sp, -48]!
+ stp d10,d11,[sp, 16]
+ add tmpbuf,sp,32
+.endm
+
+.macro restore_stack
+ ldp d10,d11,[sp, 16]
+ ldp d8,d9,[sp], 48
+.endm
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_dec.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_dec.S
new file mode 100644
index 000000000..aa46ded08
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_dec.S
@@ -0,0 +1,116 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+
+#include "xts_aes_256_common.S"
+#include "xts_aes_common.S"
+
+.macro vswap vec1:req,vec2:req
+ mov vtmp.16b,\vec1\().16b
+ mov \vec1\().16b,\vec2\().16b
+ mov \vec2\().16b,vtmp.16b
+.endm
+
+/* encrypt the tweak by tweak key (k2), and at the same time
+ * expand the encryption key (k1)
+ * even though two sets of keys share the same set of registers
+ * they never overlap at any given time (k2 is used only once and discarded)
+ */
+.macro keyexp_and_encrypt_tweak iv:req,k2:req,k1:req
+ aes_key_expand 0,\k2
+ aes_enc_round \iv,0
+ aes_enc_round \iv,1
+ aes_key_expand 2
+ aes_key_expand 0,\k1,rcon2
+ aes_enc_round \iv,2
+ aes_enc_round \iv,3
+ aes_key_expand 4
+ aes_key_expand 2,rcon2
+ aes_enc_round \iv,4
+ aes_enc_round \iv,5
+ aes_key_expand 6
+ aes_key_expand 4,rcon2
+ aes_enc_round \iv,6
+ aes_enc_round \iv,7
+ aes_key_expand 8
+ aes_key_expand 6,rcon2
+ aes_enc_round \iv,8
+ aes_enc_round \iv,9
+ aes_key_expand 10
+ aes_key_expand 8,rcon2
+ aes_enc_round \iv,10
+ aes_enc_round \iv,11
+ aes_key_expand 12
+ aes_key_expand 10,rcon2
+ aes_enc_round \iv,12
+ aes_enc_round \iv,13
+ aes_key_expand 14
+ aes_key_expand 12,rcon2
+ aes_enc_round \iv,14
+ aes_key_expand 14,rcon2
+
+	// transform encryption key into decryption key
+ aesimc vKey1.16b,vKey1.16b
+ vswap vKey0,vKey14
+ aesimc vKey13.16b,vKey13.16b
+ aesimc vKey2.16b,vKey2.16b
+ vswap vKey1,vKey13
+ aesimc vKey12.16b,vKey12.16b
+ aesimc vKey3.16b,vKey3.16b
+ vswap vKey2,vKey12
+ aesimc vKey11.16b,vKey11.16b
+ aesimc vKey4.16b,vKey4.16b
+ vswap vKey3,vKey11
+ aesimc vKey10.16b,vKey10.16b
+ aesimc vKey5.16b,vKey5.16b
+ vswap vKey4,vKey10
+ aesimc vKey9.16b,vKey9.16b
+ aesimc vKey6.16b,vKey6.16b
+ vswap vKey5,vKey9
+ aesimc vKey8.16b,vKey8.16b
+ aesimc vKey7.16b,vKey7.16b
+ vswap vKey6,vKey8
+.endm
+
+/*
+ * void XTS_AES_256_dec_ce(
+ * uint8_t *k2, //!< key used for tweaking, 32 bytes
+ * uint8_t *k1, //!< key used for decryption of tweaked ciphertext, 32 bytes
+ * uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ * uint64_t N, //!< sector size, in bytes
+ * const uint8_t *ct, //!< ciphertext sector input data
+ * uint8_t *pt //!< plaintext sector output data
+ * );
+*/
+ .global XTS_AES_256_dec_ce
+ .type XTS_AES_256_dec_ce, %function
+XTS_AES_256_dec_ce:
+ xts_aes_crypt 1,keyexp_and_encrypt_tweak vIV0,key2,key1
+ .size XTS_AES_256_dec_ce, .-XTS_AES_256_dec_ce
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_enc.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_enc.S
new file mode 100644
index 000000000..8e4088a4d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_enc.S
@@ -0,0 +1,88 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+
+#include "xts_aes_256_common.S"
+#include "xts_aes_common.S"
+
+/* encrypt the tweak by tweak key (k2), and at the same time
+ * expand the encryption key (k1)
+ * even though two sets of keys share the same set of registers
+ * they never overlap at any given time (k2 is used once and discarded)
+ */
+.macro keyexp_and_encrypt_tweak iv:req,k2:req,k1:req
+ aes_key_expand 0,\k2
+ aes_enc_round \iv,0
+ aes_enc_round \iv,1
+ aes_key_expand 2
+ aes_key_expand 0,\k1,rcon2
+ aes_enc_round \iv,2
+ aes_enc_round \iv,3
+ aes_key_expand 4
+ aes_key_expand 2,rcon2
+ aes_enc_round \iv,4
+ aes_enc_round \iv,5
+ aes_key_expand 6
+ aes_key_expand 4,rcon2
+ aes_enc_round \iv,6
+ aes_enc_round \iv,7
+ aes_key_expand 8
+ aes_key_expand 6,rcon2
+ aes_enc_round \iv,8
+ aes_enc_round \iv,9
+ aes_key_expand 10
+ aes_key_expand 8,rcon2
+ aes_enc_round \iv,10
+ aes_enc_round \iv,11
+ aes_key_expand 12
+ aes_key_expand 10,rcon2
+ aes_enc_round \iv,12
+ aes_enc_round \iv,13
+ aes_key_expand 14
+ aes_key_expand 12,rcon2
+ aes_enc_round \iv,14
+ aes_key_expand 14,rcon2
+.endm
+
+/*
+ * void XTS_AES_256_enc_ce(
+ *	uint8_t *k2,	//!< key used for tweaking, 32 bytes
+ *	uint8_t *k1,	//!< key used for encryption of tweaked plaintext, 32 bytes
+ * uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ * uint64_t N, //!< sector size, in bytes
+ * const uint8_t *pt, //!< cleartext sector input data
+ * uint8_t *ct //!< ciphertext sector output data
+ * );
+ */
+ .global XTS_AES_256_enc_ce
+ .type XTS_AES_256_enc_ce, %function
+XTS_AES_256_enc_ce:
+ xts_aes_crypt 0,keyexp_and_encrypt_tweak vIV0,key2,key1
+ .size XTS_AES_256_enc_ce, .-XTS_AES_256_enc_ce
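For orientation, a caller of the routine above could look like the following sketch; the prototype is copied from the comment, while encrypt_sector and the 512-byte sector size are illustrative assumptions (portable code would normally call the CPU-dispatched XTS_AES_256_enc name rather than this _ce variant directly).

#include <stdint.h>

/* Prototype as documented in the comment above (ce = ARMv8 Crypto Extension build). */
void XTS_AES_256_enc_ce(uint8_t *k2, uint8_t *k1, uint8_t *TW_initial,
                        uint64_t N, const uint8_t *pt, uint8_t *ct);

/* Hypothetical helper: encrypt one 512-byte sector with AES-256-XTS. */
static void encrypt_sector(uint8_t k2[32], uint8_t k1[32], uint8_t tweak[16],
                           const uint8_t pt[512], uint8_t ct[512])
{
    XTS_AES_256_enc_ce(k2, k1, tweak, 512, pt, ct);
}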
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_common.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_common.S
new file mode 100644
index 000000000..c32a13820
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_common.S
@@ -0,0 +1,232 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+.macro declare_var_vector_reg name:req,reg:req
+.ifdef q\name
+ .unreq q\name
+ .unreq v\name
+ .unreq s\name
+ .unreq d\name
+.endif
+ .set q\name , \reg
+ q\name .req q\reg
+ v\name .req v\reg
+ s\name .req s\reg
+ d\name .req d\reg
+.endm
+
+.macro declare_var_generic_reg name:req,reg:req
+ \name .req x\reg
+ x\name .req x\reg
+ w\name .req w\reg
+.endm
+
+ declare_var_vector_reg zero ,0
+ declare_var_vector_reg tmp,1
+ declare_var_vector_reg mask,2
+ declare_var_vector_reg dest,3
+ declare_var_vector_reg blk0,4
+ declare_var_vector_reg blk1,5
+ declare_var_vector_reg blk2,6
+ declare_var_vector_reg blk3,7
+ declare_var_vector_reg Key11,8
+ declare_var_vector_reg Key12,9
+ declare_var_vector_reg Key13,10
+ declare_var_vector_reg Key14,11
+ declare_var_vector_reg SavedIv,16
+ declare_var_vector_reg IV0,17
+ declare_var_vector_reg IV1,18
+ declare_var_vector_reg IV2,19
+ declare_var_vector_reg IV3,20
+ declare_var_vector_reg Key0,21
+ declare_var_vector_reg Key1,22
+ declare_var_vector_reg Key2,23
+ declare_var_vector_reg Key3,24
+ declare_var_vector_reg Key4,25
+ declare_var_vector_reg Key5,26
+ declare_var_vector_reg Key6,27
+ declare_var_vector_reg Key7,28
+ declare_var_vector_reg Key8,29
+ declare_var_vector_reg Key9,30
+ declare_var_vector_reg Key10,31
+
+.macro aes_enc_round block:req,key:req
+ aes_round \block,\key,0
+.endm
+
+.macro aes_dec_round block:req,key:req
+ aes_round \block,\key,1
+.endm
+
+.macro update_iv current:req,next:req
+ mov ivh,\current\().d[1]
+ mov ivl,\current\().d[0]
+ mov tmpw,#0x87
+ extr tmpx2,ivh,ivh,#32
+ extr ivh,ivh,ivl,#63
+ and tmpw,tmpw,tmpw2,asr#31
+ eor ivl,tmpx,ivl,lsl#1
+ mov \next\().d[1],ivh
+ mov \next\().d[0],ivl
+.endm
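The update_iv macro above is the XTS tweak update: a multiplication by x in GF(2^128) with the 0x87 reduction constant, carried across the two 64-bit tweak halves (ivl/ivh). A minimal C sketch of the same computation, assuming tw[0] holds the low 64 bits and tw[1] the high 64 bits (xts_update_tweak is a hypothetical name used only for illustration):

#include <stdint.h>

/* Multiply the 128-bit tweak by x in GF(2^128). The 0x87 constant folds the
 * bit shifted out of the high half back into the low byte, which is what the
 * asr #31 / and sequence above computes as a branch-free mask. */
static void xts_update_tweak(uint64_t tw[2])
{
    uint64_t carry = (tw[1] >> 63) ? 0x87 : 0;  /* was the top bit set? */
    tw[1] = (tw[1] << 1) | (tw[0] >> 63);       /* shift high half, pull carry from low */
    tw[0] = (tw[0] << 1) ^ carry;               /* shift low half, apply reduction */
}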
+
+.macro process_4_blks inp:req,outp:req,mode:req,is_tail
+ update_iv vIV0,vIV1
+ update_iv vIV1,vIV2
+ ldp qblk0,qblk1,[\inp],#32
+ ldp qblk2,qblk3,[\inp],#32
+ .ifnb \is_tail
+ update_iv vIV2, vSavedIv
+ update_iv vSavedIv,vIV3
+ .else
+ update_iv vIV2,vIV3
+ .endif
+ eor vblk0.16b,vblk0.16b,vIV0.16b
+ eor vblk1.16b,vblk1.16b,vIV1.16b
+ eor vblk2.16b,vblk2.16b,vIV2.16b
+ eor vblk3.16b,vblk3.16b,vIV3.16b
+
+ aes_rounds_interleave vblk0,vblk1,vblk2,vblk3,\mode
+ eor vblk0.16b,vblk0.16b,vIV0.16b
+ eor vblk1.16b,vblk1.16b,vIV1.16b
+ stp qblk0,qblk1,[\outp],#32
+ eor vblk2.16b,vblk2.16b,vIV2.16b
+ eor vblk3.16b,vblk3.16b,vIV3.16b
+ stp qblk2,qblk3,[\outp],#32
+ .ifb \is_tail
+ update_iv vIV3,vIV0
+ .endif
+.endm
+
+.macro process_1_blk inp:req,outp:req,mode:req
+ ld1 {vblk0.16b},[\inp],#16
+ eor vblk0.16b,vblk0.16b,vIV0.16b
+ aes_rounds vblk0,\mode
+ eor vblk0.16b,vblk0.16b,vIV0.16b
+ str qblk0,[\outp], #16
+.endm
+
+ key2 .req x0
+ key1 .req x1
+ iv .req x2
+ bytes .req x3
+ inp .req x4
+ outp .req x5
+ rcon .req w6
+ blocks .req x7
+ tmpx .req x8
+ tmpw .req w8
+ tmpw2 .req w9
+ tmpx2 .req x9
+ ivl .req x10
+ ivh .req x11
+ lastblk .req x12
+ tmpbuf .req x13
+ tailcnt .req x14
+ rcon2 .req w15
+
+.macro xts_aes_crypt mode:req,expander,more:vararg
+ save_stack
+
+ ld1 {vIV0.16b},[iv],16
+ .ifnb \expander
+ \expander\() \more
+ .endif
+ lsr blocks,bytes,4
+ and tailcnt,bytes,#0x0F
+
+ cmp bytes,16
+ b.lt .return
+
+.process_4_blks:
+ cmp blocks, 4
+ b.lt .singles
+ subs blocks,blocks,4
+ /* in decryption mode, check whether this is
+	 * the last full block before the less-than-one-block tail;
+	 * the tweak needs to be swapped in that case
+ */
+ .if \mode == 1
+ b.gt .not_tail_4blk
+ cmp tailcnt,1
+ b.lt .not_tail_4blk
+ process_4_blks inp,outp,\mode,1
+ b .process_4_blks
+.not_tail_4blk:
+ .endif
+ process_4_blks inp,outp,\mode
+ b .process_4_blks
+
+.singles:
+ subs blocks,blocks,#1
+ b.lt .checktail
+ /* in decryption mode, check whether this is
+	 * the last full block before the less-than-one-block tail;
+	 * the tweak needs to be swapped in that case
+ */
+ .if \mode == 1
+ b.gt .not_tail_1blk
+ cmp tailcnt,1
+ b.lt .not_tail_1blk
+ mov vSavedIv.16b, vIV0.16b
+ update_iv vSavedIv, vIV0
+ process_1_blk inp,outp,\mode
+ b .checktail
+.not_tail_1blk:
+ .endif
+ process_1_blk inp,outp,\mode
+ update_iv vIV0,vIV0
+ b .singles
+.checktail:
+ cmp tailcnt,1
+ b.lt .return
+ sub lastblk,outp,#16
+.copytail:
+ subs tailcnt,tailcnt,#1
+ ldrb tmpw,[lastblk,tailcnt]
+ strb tmpw,[outp,tailcnt]
+ ldrb tmpw,[inp,tailcnt]
+ strb tmpw,[tmpbuf,tailcnt]
+ b.gt .copytail
+ and tailcnt,bytes,#0x0F
+.steal:
+ cmp tailcnt,15
+ ldrb tmpw,[lastblk,tailcnt]
+ strb tmpw,[tmpbuf,tailcnt]
+ add tailcnt,tailcnt,#1
+ b.lt .steal
+ .if \mode == 1
+ mov vIV0.16b,vSavedIv.16b
+ .endif
+ process_1_blk tmpbuf,lastblk,\mode
+.return:
+ restore_stack
+ ret
+.endm
+
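The .checktail/.copytail/.steal labels in xts_aes_crypt above implement XTS ciphertext stealing for a trailing partial block. A C sketch of the same data movement, under the assumption that out points just past the already-processed last full block and that process_blk stands in for one full-block XTS operation with the final tweak (both names are hypothetical):

#include <stdint.h>
#include <string.h>

static void xts_steal_tail(uint8_t *out, const uint8_t *in_tail, size_t tailcnt,
                           void (*process_blk)(uint8_t dst[16], const uint8_t src[16]))
{
    uint8_t tmpbuf[16];
    uint8_t *lastblk = out - 16;                /* output of the last full block */

    memcpy(out, lastblk, tailcnt);              /* tail output = head of previous block */
    memcpy(tmpbuf, in_tail, tailcnt);           /* partial input block...               */
    memcpy(tmpbuf + tailcnt, lastblk + tailcnt, /* ...padded with the stolen bytes      */
           16 - tailcnt);
    process_blk(lastblk, tmpbuf);               /* rewrite the last full block slot     */
}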
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_dec.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_dec.S
new file mode 100644
index 000000000..9549ebfa0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_dec.S
@@ -0,0 +1,49 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+
+#include "xts_aes_128_common.S"
+#include "xts_aes_common.S"
+
+/*
+ * void XTS_AES_128_dec_expanded_key_ce(
+ * uint8_t *k2, //!< expanded key used for tweaking, 16*11 bytes - encryption key is used
+ * uint8_t *k1, //!< expanded decryption key used for decryption of tweaked ciphertext, 16*11 bytes
+ * uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ * uint64_t N, //!< sector size, in bytes
+ * const uint8_t *ct, //!< ciphertext sector input data
+ * uint8_t *pt //!< plaintext sector output data
+ * );
+*/
+ .global XTS_AES_128_dec_expanded_key_ce
+ .type XTS_AES_128_dec_expanded_key_ce, %function
+XTS_AES_128_dec_expanded_key_ce:
+ xts_aes_crypt 1,keyload_and_encrypt_tweak,vIV0,key2,key1
+ .size XTS_AES_128_dec_expanded_key_ce, .-XTS_AES_128_dec_expanded_key_ce
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_enc.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_enc.S
new file mode 100644
index 000000000..1f2d2db2e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_enc.S
@@ -0,0 +1,49 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+
+#include "xts_aes_128_common.S"
+#include "xts_aes_common.S"
+
+/*
+ * void XTS_AES_128_enc_expanded_key_ce(
+ * uint8_t *k2, //!< expanded key used for tweaking, 16*11 bytes
+ * uint8_t *k1, //!< expanded key used for encryption of tweaked plaintext, 16*11 bytes
+ * uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ * uint64_t N, //!< sector size, in bytes
+ * const uint8_t *pt, //!< plaintext sector input data
+ * uint8_t *ct //!< ciphertext sector output data
+ * );
+ */
+ .global XTS_AES_128_enc_expanded_key_ce
+ .type XTS_AES_128_enc_expanded_key_ce, %function
+XTS_AES_128_enc_expanded_key_ce:
+ xts_aes_crypt 0,keyload_and_encrypt_tweak,vIV0,key2,key1
+ .size XTS_AES_128_enc_expanded_key_ce, .-XTS_AES_128_enc_expanded_key_ce
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_dec.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_dec.S
new file mode 100644
index 000000000..95c8bf63d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_dec.S
@@ -0,0 +1,49 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+
+#include "xts_aes_256_common.S"
+#include "xts_aes_common.S"
+
+/*
+ * void XTS_AES_256_dec_expanded_key_ce(
+ * uint8_t *k2, //!< expanded key used for tweaking, 16*15 bytes - encryption key is used
+ * uint8_t *k1, //!< expanded decryption key used for decryption of tweaked ciphertext, 16*15 bytes
+ * uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ * uint64_t N, //!< sector size, in bytes
+ * const uint8_t *ct, //!< ciphertext sector input data
+ * uint8_t *pt //!< plaintext sector output data
+ * );
+*/
+ .global XTS_AES_256_dec_expanded_key_ce
+ .type XTS_AES_256_dec_expanded_key_ce, %function
+XTS_AES_256_dec_expanded_key_ce:
+ xts_aes_crypt 1,keyload_and_encrypt_tweak,vIV0,key2,key1
+ .size XTS_AES_256_dec_expanded_key_ce, .-XTS_AES_256_dec_expanded_key_ce
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_enc.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_enc.S
new file mode 100644
index 000000000..bd840a994
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_enc.S
@@ -0,0 +1,49 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+
+#include "xts_aes_256_common.S"
+#include "xts_aes_common.S"
+
+/*
+ * void XTS_AES_256_enc_expanded_key_ce(
+ * uint8_t *k2, //!< expanded key used for tweaking, 16*15 bytes
+ * uint8_t *k1, //!< expanded key used for encryption of tweaked plaintext, 16*15 bytes
+ * uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ * uint64_t N, //!< sector size, in bytes
+ * const uint8_t *pt, //!< plaintext sector input data
+ * uint8_t *ct //!< ciphertext sector output data
+ * );
+ */
+ .global XTS_AES_256_enc_expanded_key_ce
+ .type XTS_AES_256_enc_expanded_key_ce, %function
+XTS_AES_256_enc_expanded_key_ce:
+ xts_aes_crypt 0,keyload_and_encrypt_tweak,vIV0,key2,key1
+ .size XTS_AES_256_enc_expanded_key_ce, .-XTS_AES_256_enc_expanded_key_ce
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_multibinary_aarch64.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_multibinary_aarch64.S
new file mode 100644
index 000000000..af77d885b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_multibinary_aarch64.S
@@ -0,0 +1,39 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aarch64_multibinary.h"
+
+mbin_interface XTS_AES_128_enc
+mbin_interface XTS_AES_128_dec
+mbin_interface XTS_AES_128_enc_expanded_key
+mbin_interface XTS_AES_128_dec_expanded_key
+mbin_interface XTS_AES_256_enc
+mbin_interface XTS_AES_256_dec
+mbin_interface XTS_AES_256_enc_expanded_key
+mbin_interface XTS_AES_256_dec_expanded_key
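These mbin_interface entries create the public, CPU-dispatched symbols; applications call the plain names and the multibinary layer selects the Crypto Extension implementation at runtime when it is available. A hedged usage sketch (the aes_xts.h include path and the 512-byte sector size are assumptions made for illustration):

#include <stdint.h>
#include <aes_xts.h>    /* ISA-L crypto XTS prototypes; header name assumed */

/* Hypothetical helper: AES-128-XTS encrypt one 512-byte sector via the
 * dispatched entry point rather than a CPU-specific _ce/_sse variant. */
static void encrypt_sector_128(uint8_t k2[16], uint8_t k1[16], uint8_t tweak[16],
                               const uint8_t pt[512], uint8_t ct[512])
{
    XTS_AES_128_enc(k2, k1, tweak, 512, pt, ct);
}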
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aes_common.asm b/src/crypto/isa-l/isa-l_crypto/aes/aes_common.asm
new file mode 100644
index 000000000..22f00b395
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aes_common.asm
@@ -0,0 +1,377 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifndef _AES_COMMON_ASM_
+%define _AES_COMMON_ASM_
+
+%include "reg_sizes.asm"
+
+;; =============================================================================
+;; Generic macro to produce code that executes %%OPCODE instruction
+;; on a selected number of AES blocks (16 bytes each) between 0 and 16.
+;; All three operands of the instruction come from registers.
+;; Note: if 3 blocks are left at the end, an instruction operating on all
+;; 4 blocks (the full width of a ZMM register) is produced.
+
+%macro ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 14
+%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16)
+%define %%OPCODE %2 ; [in] instruction name
+%define %%DST0 %3 ; [out] destination ZMM register
+%define %%DST1 %4 ; [out] destination ZMM register
+%define %%DST2 %5 ; [out] destination ZMM register
+%define %%DST3 %6 ; [out] destination ZMM register
+%define %%SRC1_0 %7 ; [in] source 1 ZMM register
+%define %%SRC1_1 %8 ; [in] source 1 ZMM register
+%define %%SRC1_2 %9 ; [in] source 1 ZMM register
+%define %%SRC1_3 %10 ; [in] source 1 ZMM register
+%define %%SRC2_0 %11 ; [in] source 2 ZMM register
+%define %%SRC2_1 %12 ; [in] source 2 ZMM register
+%define %%SRC2_2 %13 ; [in] source 2 ZMM register
+%define %%SRC2_3 %14 ; [in] source 2 ZMM register
+
+%assign reg_idx 0
+%assign blocks_left %%NUM_BLOCKS
+
+%rep (%%NUM_BLOCKS / 4)
+%xdefine %%DSTREG %%DST %+ reg_idx
+%xdefine %%SRC1REG %%SRC1_ %+ reg_idx
+%xdefine %%SRC2REG %%SRC2_ %+ reg_idx
+ %%OPCODE %%DSTREG, %%SRC1REG, %%SRC2REG
+%undef %%DSTREG
+%undef %%SRC1REG
+%undef %%SRC2REG
+%assign reg_idx (reg_idx + 1)
+%assign blocks_left (blocks_left - 4)
+%endrep
+
+%xdefine %%DSTREG %%DST %+ reg_idx
+%xdefine %%SRC1REG %%SRC1_ %+ reg_idx
+%xdefine %%SRC2REG %%SRC2_ %+ reg_idx
+
+%if blocks_left == 1
+ %%OPCODE XWORD(%%DSTREG), XWORD(%%SRC1REG), XWORD(%%SRC2REG)
+%elif blocks_left == 2
+ %%OPCODE YWORD(%%DSTREG), YWORD(%%SRC1REG), YWORD(%%SRC2REG)
+%elif blocks_left == 3
+ %%OPCODE %%DSTREG, %%SRC1REG, %%SRC2REG
+%endif
+
+%endmacro
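A small C model of the block-to-register-width mapping this macro expands to can make the %rep/%if logic easier to follow; note the special case where a 3-block remainder is rounded up to a full ZMM operation, as the note above states (zmm_op_shape is an illustrative name only).

#include <stdio.h>

/* For a 0..16 block count, report how many full 4-block ZMM operations are
 * emitted plus the width, in blocks, of the single trailing operation. */
static void zmm_op_shape(int num_blocks, int *full_zmm_ops, int *tail_blocks)
{
    *full_zmm_ops = num_blocks / 4;
    *tail_blocks  = num_blocks % 4;   /* 1 -> XMM, 2 -> YMM */
    if (*tail_blocks == 3)
        *tail_blocks = 4;             /* 3 -> whole ZMM (all 4 lanes) */
}

int main(void)
{
    for (int n = 0; n <= 16; n++) {
        int full, tail;
        zmm_op_shape(n, &full, &tail);
        printf("blocks=%2d -> %d full ZMM op(s), tail of %d block(s)\n", n, full, tail);
    }
    return 0;
}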
+
+;; =============================================================================
+;; Loads specified number of AES blocks into ZMM registers
+;; %%FLAGS are optional and only affect behavior when 3 trailing blocks are left
+;; - if %%FLAGS is not provided then exactly 3 blocks are loaded (move and insert)
+;; - if "load_4_instead_of_3" option is passed then 4 blocks are loaded
+%macro ZMM_LOAD_BLOCKS_0_16 7-8
+%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16)
+%define %%INP %2 ; [in] input data pointer to read from
+%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical)
+%define %%DST0 %4 ; [out] ZMM register with loaded data
+%define %%DST1 %5 ; [out] ZMM register with loaded data
+%define %%DST2 %6 ; [out] ZMM register with loaded data
+%define %%DST3 %7 ; [out] ZMM register with loaded data
+%define %%FLAGS %8 ; [in] optional "load_4_instead_of_3"
+
+%assign src_offset 0
+%assign dst_idx 0
+
+%rep (%%NUM_BLOCKS / 4)
+%xdefine %%DSTREG %%DST %+ dst_idx
+ vmovdqu8 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset]
+%undef %%DSTREG
+%assign src_offset (src_offset + 64)
+%assign dst_idx (dst_idx + 1)
+%endrep
+
+%assign blocks_left (%%NUM_BLOCKS % 4)
+%xdefine %%DSTREG %%DST %+ dst_idx
+
+%if blocks_left == 1
+ vmovdqu8 XWORD(%%DSTREG), [%%INP + %%DATA_OFFSET + src_offset]
+%elif blocks_left == 2
+ vmovdqu8 YWORD(%%DSTREG), [%%INP + %%DATA_OFFSET + src_offset]
+%elif blocks_left == 3
+%ifidn %%FLAGS, load_4_instead_of_3
+ vmovdqu8 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset]
+%else
+ vmovdqu8 YWORD(%%DSTREG), [%%INP + %%DATA_OFFSET + src_offset]
+ vinserti64x2 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset + 32], 2
+%endif
+%endif
+
+%endmacro
+
+;; =============================================================================
+;; Loads specified number of AES blocks into ZMM registers using mask register
+;; for the last loaded register (xmm, ymm or zmm).
+;; Loads take place at 1 byte granularity.
+%macro ZMM_LOAD_MASKED_BLOCKS_0_16 8
+%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16)
+%define %%INP %2 ; [in] input data pointer to read from
+%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical)
+%define %%DST0 %4 ; [out] ZMM register with loaded data
+%define %%DST1 %5 ; [out] ZMM register with loaded data
+%define %%DST2 %6 ; [out] ZMM register with loaded data
+%define %%DST3 %7 ; [out] ZMM register with loaded data
+%define %%MASK %8 ; [in] mask register
+
+%assign src_offset 0
+%assign dst_idx 0
+%assign blocks_left %%NUM_BLOCKS
+
+%if %%NUM_BLOCKS > 0
+%rep (((%%NUM_BLOCKS + 3) / 4) - 1)
+%xdefine %%DSTREG %%DST %+ dst_idx
+ vmovdqu8 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset]
+%undef %%DSTREG
+%assign src_offset (src_offset + 64)
+%assign dst_idx (dst_idx + 1)
+%assign blocks_left (blocks_left - 4)
+%endrep
+%endif ; %if %%NUM_BLOCKS > 0
+
+%xdefine %%DSTREG %%DST %+ dst_idx
+
+%if blocks_left == 1
+ vmovdqu8 XWORD(%%DSTREG){%%MASK}{z}, [%%INP + %%DATA_OFFSET + src_offset]
+%elif blocks_left == 2
+ vmovdqu8 YWORD(%%DSTREG){%%MASK}{z}, [%%INP + %%DATA_OFFSET + src_offset]
+%elif (blocks_left == 3 || blocks_left == 4)
+ vmovdqu8 %%DSTREG{%%MASK}{z}, [%%INP + %%DATA_OFFSET + src_offset]
+%endif
+
+%endmacro
+
+;; =============================================================================
+;; Stores specified number of AES blocks from ZMM registers
+%macro ZMM_STORE_BLOCKS_0_16 7
+%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16)
+%define %%OUTP %2 ; [in] output data pointer to write to
+%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical)
+%define %%SRC0 %4 ; [in] ZMM register with data to store
+%define %%SRC1 %5 ; [in] ZMM register with data to store
+%define %%SRC2 %6 ; [in] ZMM register with data to store
+%define %%SRC3 %7 ; [in] ZMM register with data to store
+
+%assign dst_offset 0
+%assign src_idx 0
+
+%rep (%%NUM_BLOCKS / 4)
+%xdefine %%SRCREG %%SRC %+ src_idx
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], %%SRCREG
+%undef %%SRCREG
+%assign dst_offset (dst_offset + 64)
+%assign src_idx (src_idx + 1)
+%endrep
+
+%assign blocks_left (%%NUM_BLOCKS % 4)
+%xdefine %%SRCREG %%SRC %+ src_idx
+
+%if blocks_left == 1
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], XWORD(%%SRCREG)
+%elif blocks_left == 2
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], YWORD(%%SRCREG)
+%elif blocks_left == 3
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], YWORD(%%SRCREG)
+ vextracti32x4 [%%OUTP + %%DATA_OFFSET + dst_offset + 32], %%SRCREG, 2
+%endif
+
+%endmacro
+
+;; =============================================================================
+;; Stores specified number of AES blocks from ZMM registers with mask register
+;; for the last loaded register (xmm, ymm or zmm).
+;; Stores take place at 1 byte granularity.
+%macro ZMM_STORE_MASKED_BLOCKS_0_16 8
+%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16)
+%define %%OUTP %2 ; [in] output data pointer to write to
+%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical)
+%define %%SRC0 %4 ; [in] ZMM register with data to store
+%define %%SRC1 %5 ; [in] ZMM register with data to store
+%define %%SRC2 %6 ; [in] ZMM register with data to store
+%define %%SRC3 %7 ; [in] ZMM register with data to store
+%define %%MASK %8 ; [in] mask register
+
+%assign dst_offset 0
+%assign src_idx 0
+%assign blocks_left %%NUM_BLOCKS
+
+%if %%NUM_BLOCKS > 0
+%rep (((%%NUM_BLOCKS + 3) / 4) - 1)
+%xdefine %%SRCREG %%SRC %+ src_idx
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], %%SRCREG
+%undef %%SRCREG
+%assign dst_offset (dst_offset + 64)
+%assign src_idx (src_idx + 1)
+%assign blocks_left (blocks_left - 4)
+%endrep
+%endif ; %if %%NUM_BLOCKS > 0
+
+%xdefine %%SRCREG %%SRC %+ src_idx
+
+%if blocks_left == 1
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset]{%%MASK}, XWORD(%%SRCREG)
+%elif blocks_left == 2
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset]{%%MASK}, YWORD(%%SRCREG)
+%elif (blocks_left == 3 || blocks_left == 4)
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset]{%%MASK}, %%SRCREG
+%endif
+
+%endmacro
+
+;;; ===========================================================================
+;;; Handles AES encryption rounds
+;;; It handles special cases: the last and first rounds
+;;; Optionally, it performs XOR with data after the last AES round.
+;;; Uses the NROUNDS parameter to check what needs to be done for the current round.
+;;; If 3 blocks are trailing then operation on whole ZMM is performed (4 blocks).
+%macro ZMM_AESENC_ROUND_BLOCKS_0_16 12
+%define %%L0B0_3 %1 ; [in/out] zmm; blocks 0 to 3
+%define %%L0B4_7 %2 ; [in/out] zmm; blocks 4 to 7
+%define %%L0B8_11 %3 ; [in/out] zmm; blocks 8 to 11
+%define %%L0B12_15 %4 ; [in/out] zmm; blocks 12 to 15
+%define %%KEY %5 ; [in] zmm containing round key
+%define %%ROUND %6 ; [in] round number
+%define %%D0_3 %7 ; [in] zmm or no_data; plain/cipher text blocks 0-3
+%define %%D4_7 %8 ; [in] zmm or no_data; plain/cipher text blocks 4-7
+%define %%D8_11 %9 ; [in] zmm or no_data; plain/cipher text blocks 8-11
+%define %%D12_15 %10 ; [in] zmm or no_data; plain/cipher text blocks 12-15
+%define %%NUMBL %11 ; [in] number of blocks; numerical value
+%define %%NROUNDS %12 ; [in] number of rounds; numerical value
+
+;;; === first AES round
+%if (%%ROUND < 1)
+ ;; round 0
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%KEY, %%KEY, %%KEY, %%KEY
+%endif ; ROUND 0
+
+;;; === middle AES rounds
+%if (%%ROUND >= 1 && %%ROUND <= %%NROUNDS)
+ ;; rounds 1 to 9/11/13
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesenc, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%KEY, %%KEY, %%KEY, %%KEY
+%endif ; rounds 1 to 9/11/13
+
+;;; === last AES round
+%if (%%ROUND > %%NROUNDS)
+ ;; the last round - mix enclast with text xor's
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesenclast, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%KEY, %%KEY, %%KEY, %%KEY
+
+;;; === XOR with data
+%ifnidn %%D0_3, no_data
+%ifnidn %%D4_7, no_data
+%ifnidn %%D8_11, no_data
+%ifnidn %%D12_15, no_data
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%D0_3, %%D4_7, %%D8_11, %%D12_15
+%endif ; !no_data
+%endif ; !no_data
+%endif ; !no_data
+%endif ; !no_data
+
+%endif ; The last round
+
+%endmacro
+
+;;; ===========================================================================
+;;; Handles AES decryption rounds
+;;; It handles special cases: the last and first rounds
+;;; Optionally, it performs XOR with data after the last AES round.
+;;; Uses NROUNDS parameter to check what needs to be done for the current round.
+;;; If 3 blocks are trailing then operation on whole ZMM is performed (4 blocks).
+%macro ZMM_AESDEC_ROUND_BLOCKS_0_16 12
+%define %%L0B0_3 %1 ; [in/out] zmm; blocks 0 to 3
+%define %%L0B4_7 %2 ; [in/out] zmm; blocks 4 to 7
+%define %%L0B8_11 %3 ; [in/out] zmm; blocks 8 to 11
+%define %%L0B12_15 %4 ; [in/out] zmm; blocks 12 to 15
+%define %%KEY %5 ; [in] zmm containing round key
+%define %%ROUND %6 ; [in] round number
+%define %%D0_3 %7 ; [in] zmm or no_data; cipher text blocks 0-3
+%define %%D4_7 %8 ; [in] zmm or no_data; cipher text blocks 4-7
+%define %%D8_11 %9 ; [in] zmm or no_data; cipher text blocks 8-11
+%define %%D12_15 %10 ; [in] zmm or no_data; cipher text blocks 12-15
+%define %%NUMBL %11 ; [in] number of blocks; numerical value
+%define %%NROUNDS %12 ; [in] number of rounds; numerical value
+
+;;; === first AES round
+%if (%%ROUND < 1)
+ ;; round 0
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%KEY, %%KEY, %%KEY, %%KEY
+%endif ; ROUND 0
+
+;;; === middle AES rounds
+%if (%%ROUND >= 1 && %%ROUND <= %%NROUNDS)
+ ;; rounds 1 to 9/11/13
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesdec, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%KEY, %%KEY, %%KEY, %%KEY
+%endif ; rounds 1 to 9/11/13
+
+;;; === last AES round
+%if (%%ROUND > %%NROUNDS)
+	;; the last round - mix declast with text xor's
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesdeclast, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%KEY, %%KEY, %%KEY, %%KEY
+
+;;; === XOR with data
+%ifnidn %%D0_3, no_data
+%ifnidn %%D4_7, no_data
+%ifnidn %%D8_11, no_data
+%ifnidn %%D12_15, no_data
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%D0_3, %%D4_7, %%D8_11, %%D12_15
+%endif ; !no_data
+%endif ; !no_data
+%endif ; !no_data
+%endif ; !no_data
+
+%endif ; The last round
+
+%endmacro
+
+%endif ;; _AES_COMMON_ASM_
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm
new file mode 100644
index 000000000..2a879abdd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm
@@ -0,0 +1,431 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; the following defines control the operation of the macros below and
+; need to be defined in the including file
+; KEY_ROUNDS - number of key rounds needed based on key length: 128bit - 11, 192bit - 13 or 256bit - 15
+; EARLY_BLOCKS - number of data blocks to load before starting computations
+; PARALLEL_BLOCKS - number of blocks of data to process in parallel; also the number of xmm regs to reserve for data
+; IV_CNT - number of xmm regs to use for IV data; valid values are 0 or 1
+; TMP_CNT - number of tmp xmm registers to reserve
+; XMM_USAGE - number of xmm registers to use; must be at least PARALLEL_BLOCKS + 2
+;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;
+; the following instruction-set-specific macros must be defined in the user file
+; to make use of the AES macros below
+; MOVDQ - move from memory to xmm reg
+; PXOR - XOR of two xmm registers pxor
+; AES_DEC - AES block decode for early key rounds
+; AES_DEC_LAST - AES block decode for last key round
+; or
+; AES_ENC - AES block encode for early key rounds
+; AES_ENC_LAST - AES block encode for last key round
+
+; Three usages of xmm regs: key round cache, block data and one temp
+; CKEY_CNT is (number of xmm regs) - PARALLEL_BLOCKS - IV holder - 2 TMP xmm regs
+%assign FIRST_XDATA (0)
+%assign IV_IDX (FIRST_XDATA + PARALLEL_BLOCKS)
+%ifndef IV_CNT
+%define IV_CNT (1)
+%endif
+%assign TMP (IV_IDX + IV_CNT)
+%assign TMP_CNT (2)
+%assign FIRST_CKEY (TMP + TMP_CNT)
+%assign CKEY_CNT (XMM_USAGE - (PARALLEL_BLOCKS + IV_CNT + TMP_CNT))
+
+; Abstract xmm register usages that identify the expected contents of the register
+%define reg(i) xmm %+ i
+%define XDATA(i) xmm %+ i
+%define KEY_REG(i) xmm %+ i
+%define IV_REG(i) xmm %+ i
+
+%define IDX rax
+
+
+
+
+;
+;
+; AES CBC ENCODE MACROS
+;
+;
+
+;
+; CBC_DECRYPT_BLOCKS
+; Decrypts a number of blocks using the AES_PARALLEL_ENC_BLOCKS macro,
+; finalizes the decryption and saves the results in the output,
+; places the last buffer's cipher text in IV for the next buffer,
+; and updates the index and the number of bytes left
+;
+%macro CBC_DECRYPT_BLOCKS 17
+%define %%TOT_ROUNDS %1
+%define %%num_blocks %2 ; can be 0..13
+%define %%EARLY_LOADS	%3	; number of data blocks to load before processing
+%define %%MOVDQ %4
+%define %%PXOR %5
+%define %%AES_DEC %6
+%define %%AES_DEC_LAST %7
+%define %%CACHED_KEYS %8 ; number of key data cached in xmm regs
+%define %%TMP %9
+%define %%TMP_CNT %10
+%define %%FIRST_CKEY %11
+%define %%KEY_DATA %12
+%define %%FIRST_XDATA %13
+%define %%IN %14 ; input data
+%define %%OUT %15 ; output data
+%define %%IDX %16 ; index into input and output data buffers
+%define %%LEN %17
+
+ AES_PARALLEL_ENC_BLOCKS %%TOT_ROUNDS, %%num_blocks, %%EARLY_LOADS, %%MOVDQ, %%PXOR, %%AES_DEC, %%AES_DEC_LAST, %%CACHED_KEYS, %%TMP, %%TMP_CNT, %%FIRST_CKEY, %%KEY_DATA, %%FIRST_XDATA, %%IN, %%OUT, %%IDX
+
+ ;
+ ; XOR the result of each block's decrypt with the previous block's cypher text (C)
+ ;
+ %assign i 0
+ %rep (%%num_blocks)
+ %%PXOR XDATA(i), XDATA(IV_IDX) ; XOR result with previous block's C
+ %%MOVDQ [%%OUT + %%IDX + i*16], XDATA(i) ; save plain text to out
+ %%MOVDQ XDATA(IV_IDX), [%%IN + IDX + i*16] ; load IV with current block C
+ %assign i (i+1)
+ %endrep
+
+ add %%IDX, %%num_blocks*16
+ sub %%LEN, %%num_blocks*16
+%endmacro
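In C terms, the chaining step this macro performs after the parallel block decrypt is the standard CBC recombination sketched below; cbc_chain is a hypothetical name, dec[] stands for the raw AES block decryptions produced by AES_PARALLEL_ENC_BLOCKS, and ct[] is the original ciphertext.

#include <stdint.h>
#include <string.h>

/* pt[i] = dec[i] XOR previous ciphertext block; the last ciphertext block is
 * left in iv for the next call, matching what the macro does per iteration. */
static void cbc_chain(uint8_t *pt, const uint8_t *dec, const uint8_t *ct,
                      uint8_t iv[16], size_t nblocks)
{
    for (size_t i = 0; i < nblocks; i++) {
        for (size_t b = 0; b < 16; b++)
            pt[i * 16 + b] = dec[i * 16 + b] ^ iv[b];
        memcpy(iv, ct + i * 16, 16);            /* current C becomes next IV */
    }
}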
+
+
+;
+; CBC_ENC_INIT
+; XOR first data block with the IV data
+%macro CBC_ENC_INIT 7
+%define %%P_FIRST %1
+%define %%IV_IDX %2
+%define %%MOVDQ %3
+%define %%PXOR %4
+%define %%IV %5
+%define %%IN %6 ; input data
+%define %%IDX %7 ; index into input and output data buffers
+
+ %%MOVDQ XDATA(%%P_FIRST), [%%IN + %%IDX + 0*16]
+ %%MOVDQ reg(%%IV_IDX), [%%IV]
+ %%PXOR XDATA(%%P_FIRST), reg(%%IV_IDX)
+%endmacro
+
+;
+; assumptions:
+; LEN is length of data remaining
+; IDX is offset into the data buffer
+;
+; subloops
+; if data > 16 load next block into a next XDATA reg (XDATA(p_next))
+; load first uncached key into TMP0 (if any)
+; AES block encrypt XDATA(P_FIRST)
+; if data > 16 XOR next block (XDATA(p_next)) with current (XDATA(P_FIRST))
+; save current (XDATA(P_FIRST))
+; update indexes for P_FIRST
+; end if data zero
+;
+%macro CBC_ENC_SUBLOOP 17
+%define %%TOT_ROUNDS %1
+%define %%BLOCKS %2 ; can be 1...14
+%define %%START_DATA %3
+%define %%MOVDQ %4
+%define %%PXOR %5
+%define %%AES_DEC %6
+%define %%AES_DEC_LAST %7
+%define %%TMP %8
+%define %%TMP_CNT %9
+%define %%FIRST_CKEY %10
+%define %%CKEY_CNT %11
+%define %%KEYS %12
+%define %%CACHED_KEYS %13
+%define %%IN %14 ; input data
+%define %%OUT %15 ; output data
+%define %%IDX %16 ; index into input and output data buffers
+%define %%LEN %17
+
+ %assign this_blk 0
+ %assign next_blk 1
+ %assign p_first %%START_DATA
+ %assign p_next (p_first+1)
+ ; for number of blocks to be processed in a loop
+ %assign blk 1
+ %rep %%BLOCKS
+ ; if data > 16 load next block into a next XDATA reg (XDATA(p_next))
+ cmp %%LEN, 16
+ %push skip_read
+ je %$skip_read_next
+ %%MOVDQ XDATA(p_next), [%%IN + %%IDX + next_blk*16]
+ %$skip_read_next:
+ %pop
+
+ AES_ENC_BLOCKS %%TOT_ROUNDS, p_first, %%TMP, %%TMP_CNT, %%FIRST_CKEY, %%CKEY_CNT, %%KEYS, %%MOVDQ, %%PXOR, %%AES_DEC, %%AES_DEC_LAST
+
+		; if data > 16 XOR next block (XDATA(p_next)) with current (XDATA(p_first))
+ cmp %%LEN, 16
+ %push skip_next
+ je %$skip_next_blk_start
+ %%PXOR XDATA(p_next), XDATA(p_first)
+ %$skip_next_blk_start:
+ %pop
+
+ ; save current (XDATA(p_first))
+ %%MOVDQ [%%OUT + %%IDX + this_blk*16], XDATA(p_first)
+ ; update indexes for p_first
+ add %%IDX, 16
+ sub %%LEN, 16
+
+ %if (blk < %%BLOCKS) ; only insert jz if NOT last block
+ ; end if data zero
+ jz %%END_CBC_ENC_SUBLOOP
+ %endif ; (p_next < %%BLOCKS)
+
+ %assign p_first (p_next)
+ %assign blk (blk+1)
+ %if (blk == %%BLOCKS) ; the last rep loop's read of the next block needs to be into START_DATA
+ %assign p_next (%%START_DATA)
+ %elif (1 == %%BLOCKS)
+ %%MOVDQ XDATA(%%START_DATA), XDATA(p_next)
+ %else
+ %assign p_next (p_next+1)
+ %endif
+ %endrep ; %%BLOCKS
+
+ %%END_CBC_ENC_SUBLOOP:
+%endm ; CBC_ENC_SUBLOOP
+
+
+;
+;
+; AES BLOCK ENCODE MACROS
+;
+;
+
+;
+; FILL_KEY_CACHE
+; Load key data into the cache key xmm regs
+%macro FILL_KEY_CACHE 4
+%define %%CACHED_KEYS %1
+%define %%CKEY_START %2
+%define %%KEY_DATA %3
+%define %%MOVDQ %4
+
+ %assign rnd 0
+ %rep KEY_ROUNDS
+ %if (rnd < %%CACHED_KEYS) ; find the round's key data
+ %assign c (rnd + %%CKEY_START)
+ %%MOVDQ KEY_REG(c), [%%KEY_DATA + rnd*16] ;load sub key into an available register
+ %endif
+ %assign rnd (rnd+1)
+ %endrep
+%endmacro
+
+;
+; SCHEDULE_DATA_LOAD
+; pre-loads message data into xmm regs
+; updates global 'blocks_loaded' that tracks which data blocks have been loaded
+; 'blocks_loaded' is an in/out global and must be declared in the using macro or function
+%macro SCHEDULE_DATA_LOAD 5
+%define %%PARALLEL_DATA %1
+%define %%EARLY_LOADS %2
+%define %%MOVDQ %3
+%define %%IN %4
+%define %%IDX %5
+
+ %if (blocks_loaded < %%PARALLEL_DATA)
+ ; load cipher text
+ %%MOVDQ XDATA(blocks_loaded), [%%IN + %%IDX + blocks_loaded*16]
+ %assign blocks_loaded (blocks_loaded+1)
+ %endif ; (blocks_loaded < %%PARALLEL_DATA)
+%endmacro ; SCHEDULED_EARLY_DATA_LOADS
+
+;
+; INIT_SELECT_KEY
+; determine which xmm reg holds the key data needed or loads it into the temp register if not cached
+; 'current_tmp' is an in/out global and must be declared in the using macro or function
+%macro INIT_SELECT_KEY 6
+%define %%TOT_ROUNDS %1
+%define %%CACHED_KEYS %2
+%define %%KEY_DATA %3
+%define %%FIRST_TMP %4
+%define %%TMP_CNT %5
+%define %%MOVDQ %6
+
+ %assign current_tmp (%%FIRST_TMP)
+ %if (%%TOT_ROUNDS > %%CACHED_KEYS) ; load the first uncached key into temp reg
+ %%MOVDQ KEY_REG(current_tmp), [%%KEY_DATA + %%CACHED_KEYS*16]
+ %endif ; (KEY_ROUNDS > CKEY_CNT)
+%endmacro ; SELECT_KEY
+
+;
+; SELECT_KEY
+; determine which xmm reg holds the key data needed or loads it into the temp register if not cached
+; 'current_tmp' is an in/out global and must be declared in the using macro or function
+%macro SELECT_KEY 8
+%define %%ROUND %1
+%define %%TOT_ROUNDS %2
+%define %%CACHED_KEYS %3
+%define %%FIRST_KEY %4
+%define %%KEY_DATA %5
+%define %%FIRST_TMP %6
+%define %%TMP_CNT %7
+%define %%MOVDQ %8
+
+ ; find the key data for this round
+ %if (%%ROUND < %%CACHED_KEYS) ; is it cached
+ %assign key (%%ROUND + %%FIRST_KEY)
+ %else
+ ; Load non-cached key %%ROUND data ping-ponging between temp regs if more than one
+ %assign key (current_tmp) ; use the previous loaded key data
+ %if (1 == %%TMP_CNT)
+ %%MOVDQ KEY_REG(current_tmp), [%%KEY_DATA + %%ROUND*16] ; load the next rounds key data
+ %else
+ %assign next_round (%%ROUND+1)
+ %if (next_round < %%TOT_ROUNDS) ; if more rounds to be done
+ %if (current_tmp == %%FIRST_TMP) ; calc the next temp reg to use
+ %assign current_tmp (current_tmp + 1)
+ %else
+ %assign current_tmp (%%FIRST_TMP)
+ %endif ; (current_tmp == %%FIRST_TMP)
+ %%MOVDQ KEY_REG(current_tmp), [%%KEY_DATA + next_round*16] ; load the next rounds key data
+
+ %endif ; (%%ROUND < KEY_ROUNDS)
+ %endif ; (1 < %%TMP_CNT)
+ %endif ; (%%ROUND < %%CACHED_KEYS)
+%endmacro ; SELECT_KEY
+
+
+;
+; AES_PARALLEL_ENC_BLOCKS
+; preloads some data blocks to be worked on
+; starts the aes block encoding while loading the other blocks to be done in parallel
+; aes block encodes each key round on each block
+%macro AES_PARALLEL_ENC_BLOCKS 16
+%define %%KEY_ROUNDS %1
+%define %%PARALLEL_DATA %2
+%define %%EARLY_LOADS %3
+%define %%MOVDQ %4
+%define %%PXOR %5
+%define %%AES_DEC %6
+%define %%AES_DEC_LAST %7
+%define %%CACHED_KEYS %8
+%define %%TMP %9
+%define %%TMP_CNT %10
+%define %%FIRST_CKEY %11
+%define %%KEY_DATA %12
+%define %%FIRST_XDATA %13
+%define %%IN %14 ; input data
+%define %%OUT %15 ; output data
+%define %%IDX %16 ; index into input and output data buffers
+
+ %assign blocks_loaded 0
+
+ %rep %%EARLY_LOADS
+ SCHEDULE_DATA_LOAD %%PARALLEL_DATA, %%EARLY_LOADS, %%MOVDQ, %%IN, %%IDX ; updates blocks_loaded
+ %endrep ; %%EARLY_LOADS
+
+ %assign current_tmp (TMP)
+ INIT_SELECT_KEY %%KEY_ROUNDS, %%CACHED_KEYS, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ
+
+ %assign round 0
+ %assign key 0
+ %rep KEY_ROUNDS ; for all key rounds
+ SELECT_KEY round, %%KEY_ROUNDS, %%CACHED_KEYS, %%FIRST_CKEY, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ
+
+ %assign i %%FIRST_XDATA
+	%rep %%PARALLEL_DATA	; for each block do the AES block encode step
+ %if (0 == round)
+ %%PXOR XDATA(i), KEY_REG(key) ; first round's step
+ SCHEDULE_DATA_LOAD %%PARALLEL_DATA, %%EARLY_LOADS, %%MOVDQ, %%IN, %%IDX
+
+ %elif ( (%%KEY_ROUNDS-1) == round )
+ %%AES_DEC_LAST XDATA(i), KEY_REG(key) ; last round's step
+
+ %else
+ %%AES_DEC XDATA(i), KEY_REG(key) ; middle round's (1..last-1) step
+
+ %endif
+ %assign i (i+1)
+ %endrep ;%%PARALLEL_DATA
+ %assign round (round+1)
+ %endrep ;KEY_ROUNDS
+%endmacro ; AES_PARALLEL_ENC_BLOCKS
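+;
+; Note: only %%EARLY_LOADS blocks are fetched before the key rounds start; the
+; remaining blocks are requested one per data lane during round 0 (see the
+; SCHEDULE_DATA_LOAD call above), so the memory loads overlap the first AES
+; rounds instead of stalling in front of them.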
+
+
+
+;
+; AES_ENC_BLOCKS
+; load first uncached key into TMP0 (if any)
+; AES block encrypt XDATA(%%ENC_BLOCK)
+; before using uncached key in TMP0, load next key in TMP1
+; before using uncached key in TMP1, load next key in TMP0
+%macro AES_ENC_BLOCKS 11
+%define %%TOT_ROUNDS %1
+%define %%ENC_BLOCK %2
+%define %%TMP %3
+%define %%TMP_CNT %4
+%define %%FIRST_CKEY %5
+%define %%CACHED_KEYS %6
+%define %%KEY_DATA %7
+%define %%MOVDQ %8
+%define %%PXOR %9
+%define %%AES_ENC %10
+%define %%AES_ENC_LAST %11
+
+ %assign current_tmp (%%TMP)
+ INIT_SELECT_KEY %%TOT_ROUNDS, %%CACHED_KEYS, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ
+
+ %assign round 0
+ %assign key (round + %%FIRST_CKEY)
+ %rep %%TOT_ROUNDS ; for all key rounds
+ ; find the key data for this round
+ SELECT_KEY round, %%TOT_ROUNDS, %%CACHED_KEYS, %%FIRST_CKEY, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ
+
+ ; encrypt block
+ %if (0 == round)
+ %%PXOR XDATA(%%ENC_BLOCK), KEY_REG(key) ; round zero step
+ %elif ( (%%TOT_ROUNDS-1) == round )
+ %%AES_ENC_LAST XDATA(%%ENC_BLOCK), KEY_REG(key) ; last round's step
+ %else
+ %%AES_ENC XDATA(%%ENC_BLOCK), KEY_REG(key) ; rounds 1..last-1 step
+ %endif ; (0 == round)
+
+ %assign round (round+1)
+ %endrep ; KEY_ROUNDS
+%endmacro ; AES_ENC_BLOCKS
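+;
+; Note: this is the single-block counterpart of AES_PARALLEL_ENC_BLOCKS; with
+; only one block in flight, the temp-register ping-pong described above is what
+; hides the latency of fetching the uncached round keys.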
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm
new file mode 100644
index 000000000..68aa227ca
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm
@@ -0,0 +1,162 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; routine to do AES cbc decrypt on 16n bytes doing AES by 4
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void aes_cbc_dec_128_sse(void *in,
+; uint8_t *IV,
+;                       uint8_t *keys,
+; void *out,
+; uint64_t len_bytes);
+;
+; arg 1: IN: pointer to input (cipher text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (plain text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;
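+; For reference, a minimal C caller might look like the sketch below.  This is
+; an illustration only; it assumes the ISA-L key-expansion helper
+; aes_keyexp_128() (whose decrypt schedule is what 'keys' is expected to point
+; at, 11 round keys of 16 bytes) and a length that is a multiple of 16:
+;
+;     #include <stdint.h>
+;     void aes_keyexp_128(const uint8_t *key, uint8_t *exp_enc, uint8_t *exp_dec);
+;     void aes_cbc_dec_128_sse(void *in, uint8_t *IV, uint8_t *keys,
+;                              void *out, uint64_t len_bytes);
+;
+;     void cbc128_decrypt(const uint8_t key[16], uint8_t iv[16],
+;                         void *ct, void *pt, uint64_t len)
+;     {
+;             uint8_t enc_keys[11 * 16], dec_keys[11 * 16];   /* 11 round keys each */
+;             aes_keyexp_128(key, enc_keys, dec_keys);
+;             aes_cbc_dec_128_sse(ct, iv, dec_keys, pt, len);
+;     }
+;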
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+
+%endif
+
+; configuration parameters for AES-CBC macros
+%define KEY_ROUNDS 11
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (2)
+%define PARALLEL_BLOCKS (8)
+%define IV_CNT (1)
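+
+; KEY_ROUNDS counts round keys (AES-128: 1 whitening key + 10 rounds = 11).
+; Of the 16 XMM registers, one holds the IV chain and the rest are split
+; between cached round keys and the PARALLEL_BLOCKS data blocks; CKEY_CNT,
+; TMP, TMP_CNT and friends are derived from these values in cbc_common.asm.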
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu
+%define PXOR pxor
+%define AES_DEC aesdec
+%define AES_DEC_LAST aesdeclast
+%include "cbc_common.asm"
+
+section .text
+
+align 16
+mk_global aes_cbc_dec_128_sse, function
+func(aes_cbc_dec_128_sse)
+ endbranch
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; if enough data blocks remain enter main_loop
+ jmp partials
+
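+	; Flow: main_loop decrypts full sets of PARALLEL_BLOCKS; any tail of
+	; 1..PARALLEL_BLOCKS-1 blocks drops into 'partials', which re-checks LEN
+	; and peels off groups of 4, then 2, then 1 block until nothing is left.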
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ jz done
+
+partials:	; fewer than 'PARALLEL_BLOCKS' left; do them in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials
+
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm
new file mode 100644
index 000000000..d4b6dfb2a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm
@@ -0,0 +1,162 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; routine to do AES128 CBC decrypt
+;; clobbers xmm0-15
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+
+%endif
+
+; configuration parameters for AES-CBC
+%define KEY_ROUNDS 11
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (4)
+%define PARALLEL_BLOCKS (11)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+%define MOVDQ vmovdqu
+%macro PXOR 2
+ vpxor %1, %1, %2
+%endm
+
+%macro AES_DEC 2
+ vaesdec %1, %1, %2
+%endm
+
+%macro AES_DEC_LAST 2
+ vaesdeclast %1, %1, %2
+%endm
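+
+; cbc_common.asm invokes PXOR/AES_DEC/AES_DEC_LAST as two-operand operations;
+; the wrappers above map them onto the three-operand VEX forms so the shared
+; macros stay identical between the SSE and AVX builds.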
+
+%include "cbc_common.asm"
+
+section .text
+
+;; aes_cbc_dec_128_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+mk_global aes_cbc_dec_128_avx, function
+func(aes_cbc_dec_128_avx)
+ endbranch
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; if enough data blocks remain enter main_loop
+ jmp partials
+
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ jz done
+
+partials:	; fewer than 'PARALLEL_BLOCKS' left; do them in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm
new file mode 100644
index 000000000..4b017d193
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm
@@ -0,0 +1,164 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; routine to do AES cbc decrypt on 16n bytes doing AES
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void aes_cbc_dec_192_sse(void *in,
+; uint8_t *IV,
+;                       uint8_t *keys,     // 13 round keys (12 rounds + 1)
+; void *out,
+; uint64_t len_bytes);
+;
+; arg 1: IN: pointer to input (cipher text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (plain text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;
+
+%include "reg_sizes.asm"
+
+%define MOVDQ movdqu
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+
+%endif
+
+; configuration parameters for AES-CBC
+%define KEY_ROUNDS 13
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (2)
+%define PARALLEL_BLOCKS (5)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu
+%define PXOR pxor
+%define AES_DEC aesdec
+%define AES_DEC_LAST aesdeclast
+
+%include "cbc_common.asm"
+
+section .text
+
+mk_global aes_cbc_dec_192_sse, function
+func(aes_cbc_dec_192_sse)
+ endbranch
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; if enough data blocks remain enter main_loop
+ jmp partials
+
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ jz done
+
+partials:	; fewer than 'PARALLEL_BLOCKS' left; do them in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm
new file mode 100644
index 000000000..2791570ad
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm
@@ -0,0 +1,158 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; routine to do AES192 CBC decrypt
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+; configuration parameters for AES-CBC
+%define KEY_ROUNDS 13
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (4)
+%define PARALLEL_BLOCKS (11)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+%define MOVDQ vmovdqu
+%macro PXOR 2
+ vpxor %1, %1, %2
+%endm
+
+%macro AES_DEC 2
+ vaesdec %1, %1, %2
+%endm
+
+%macro AES_DEC_LAST 2
+ vaesdeclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_192_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+mk_global aes_cbc_dec_192_avx, function
+func(aes_cbc_dec_192_avx)
+ endbranch
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; if enough data blocks remain enter main_loop
+ jmp partials
+
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ jz done
+
+partials:	; fewer than 'PARALLEL_BLOCKS' left; do them in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm
new file mode 100644
index 000000000..44c76268e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm
@@ -0,0 +1,161 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; routine to do AES cbc decrypt on 16n bytes doing AES
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void aes_cbc_dec_256_sse(void *in,
+; uint8_t *IV,
+;                       uint8_t *keys,
+; void *out,
+; uint64_t len_bytes);
+;
+; arg 1: IN:   pointer to input (cipher text)
+; arg 2: IV:   pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT:  pointer to output (plain text)
+; arg 5: LEN:  length in bytes (multiple of 16)
+;
+
+%include "reg_sizes.asm"
+
+%define MOVDQ movdqu
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+; configuration parameters for AES-CBC
+%define KEY_ROUNDS 15
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (4)
+%define PARALLEL_BLOCKS (11)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu
+%define PXOR pxor
+%define AES_DEC aesdec
+%define AES_DEC_LAST aesdeclast
+
+%include "cbc_common.asm"
+
+mk_global aes_cbc_dec_256_sse, function
+func(aes_cbc_dec_256_sse)
+ endbranch
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; if enough data blocks remain enter main_loop
+ jmp partials
+
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ jz done
+
+partials:	; fewer than 'PARALLEL_BLOCKS' left; do them in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm
new file mode 100644
index 000000000..cad1a6bef
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm
@@ -0,0 +1,158 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; routine to do AES256 CBC decrypt
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+; configuration parameters for AES-CBC
+%define KEY_ROUNDS 15
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (4)
+%define PARALLEL_BLOCKS (11)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+%define MOVDQ vmovdqu
+%macro PXOR 2
+ vpxor %1, %1, %2
+%endm
+
+%macro AES_DEC 2
+ vaesdec %1, %1, %2
+%endm
+
+%macro AES_DEC_LAST 2
+ vaesdeclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+mk_global aes_cbc_dec_256_avx, function
+func(aes_cbc_dec_256_avx)
+ endbranch
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; if enough data blocks remain enter main_loop
+ jmp partials
+
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ jz done
+
+partials:	; fewer than 'PARALLEL_BLOCKS' left; do them in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_vaes_avx512.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_vaes_avx512.asm
new file mode 100644
index 000000000..6124e2def
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_vaes_avx512.asm
@@ -0,0 +1,519 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2019-2021 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "aes_common.asm"
+%include "reg_sizes.asm"
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+[bits 64]
+default rel
+
+%define zIV zmm0
+%define zBLK_0_3 zmm1
+%define zBLK_4_7 zmm2
+%define zBLK_8_11 zmm3
+%define zBLK_12_15 zmm4
+%define zTMP0 zmm5
+%define zTMP1 zmm6
+%define zTMP2 zmm7
+%define zTMP3 zmm8
+
+%define ZKEY0 zmm17
+%define ZKEY1 zmm18
+%define ZKEY2 zmm19
+%define ZKEY3 zmm20
+%define ZKEY4 zmm21
+%define ZKEY5 zmm22
+%define ZKEY6 zmm23
+%define ZKEY7 zmm24
+%define ZKEY8 zmm25
+%define ZKEY9 zmm26
+%define ZKEY10 zmm27
+%define ZKEY11 zmm28
+%define ZKEY12 zmm29
+%define ZKEY13 zmm30
+%define ZKEY14 zmm31
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define p_in rdi
+%define p_IV rsi
+%define p_keys rdx
+%define p_out rcx
+%define num_bytes r8
+%else
+%define p_in rcx
+%define p_IV rdx
+%define p_keys r8
+%define p_out r9
+%define num_bytes rax
+%endif
+
+%define tmp r10
+%define tmp2 r11
+
+%ifdef CBCS
+%define OFFSET 160
+%else
+%define OFFSET 16
+%endif
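+
+; With CBCS defined the source/destination stride is 160 bytes, so only one
+; 16-byte block out of every ten is processed (the 1:9 pattern used by the
+; CBCS encryption mode); the plain CBC build walks the buffer 16 bytes at a
+; time.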
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; macro to preload keys
+;;; - uses ZKEY[0-14] registers (ZMM)
+%macro LOAD_KEYS 2
+%define %%KEYS %1 ; [in] key pointer
+%define %%NROUNDS %2 ; [in] numerical value, number of AES rounds
+ ; excluding 1st and last rounds.
+ ; Example: AES-128 -> value 9
+
+%assign i 0
+%rep (%%NROUNDS + 2)
+ vbroadcastf64x2 ZKEY %+ i, [%%KEYS + 16*i]
+%assign i (i + 1)
+%endrep
+
+%endmacro
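+
+;;; Note: %%NROUNDS excludes the first and last rounds (9/11/13 for
+;;; AES-128/192/256), so %%NROUNDS + 2 round keys are loaded, each one
+;;; broadcast across all four 128-bit lanes of its ZKEY register.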
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; This macro is used to "cool down" the pipeline after the DECRYPT_16_PARALLEL
+;;; code, as the number of final blocks is variable.
+;;; Processes the last %%num_final_blocks blocks (1 to 15, can't be 0)
+
+%macro FINAL_BLOCKS 14
+%define %%PLAIN_OUT %1 ; [in] output buffer
+%define %%CIPH_IN %2 ; [in] input buffer
+%define %%LAST_CIPH_BLK %3 ; [in/out] ZMM with IV/last cipher blk (in idx 3)
+%define %%num_final_blocks %4 ; [in] numerical value (1 - 15)
+%define %%CIPHER_PLAIN_0_3 %5 ; [out] ZMM next 0-3 cipher blocks
+%define %%CIPHER_PLAIN_4_7 %6 ; [out] ZMM next 4-7 cipher blocks
+%define %%CIPHER_PLAIN_8_11 %7 ; [out] ZMM next 8-11 cipher blocks
+%define %%CIPHER_PLAIN_12_15 %8 ; [out] ZMM next 12-15 cipher blocks
+%define %%ZT1 %9 ; [clobbered] ZMM temporary
+%define %%ZT2 %10 ; [clobbered] ZMM temporary
+%define %%ZT3 %11 ; [clobbered] ZMM temporary
+%define %%ZT4 %12 ; [clobbered] ZMM temporary
+%define %%IA0 %13 ; [clobbered] GP temporary
+%define %%NROUNDS %14 ; [in] number of rounds; numerical value
+
+ ;; load plain/cipher text
+%ifdef CBCS
+ ZMM_LOAD_BLOCKS_0_16_OFFSET %%num_final_blocks, %%CIPH_IN, \
+ OFFSET, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15
+%else
+ ZMM_LOAD_BLOCKS_0_16 %%num_final_blocks, %%CIPH_IN, 0, \
+ %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15
+%endif
+ ;; Prepare final cipher text blocks to
+ ;; be XOR'd later after AESDEC
+ valignq %%ZT1, %%CIPHER_PLAIN_0_3, %%LAST_CIPH_BLK, 6
+%if %%num_final_blocks > 4
+ valignq %%ZT2, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_0_3, 6
+%endif
+%if %%num_final_blocks > 8
+ valignq %%ZT3, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_4_7, 6
+%endif
+%if %%num_final_blocks > 12
+ valignq %%ZT4, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_8_11, 6
+%endif
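+
+	;; At this point %%ZT1..%%ZT4 hold, lane for lane, the cipher block that
+	;; precedes each block being decrypted (valignq by 6 qwords shifts the
+	;; window back by one 128-bit block, pulling the carried-in last cipher
+	;; block from the top lane of %%LAST_CIPH_BLK into the bottom lane of
+	;; %%ZT1); they are XOR'd in after the AESDEC rounds below.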
+
+ ;; Update IV with last cipher block
+ ;; to be used later in DECRYPT_16_PARALLEL
+%if %%num_final_blocks == 1
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, 2
+%elif %%num_final_blocks == 2
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, 4
+%elif %%num_final_blocks == 3
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, 6
+%elif %%num_final_blocks == 4
+ vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_0_3
+%elif %%num_final_blocks == 5
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, 2
+%elif %%num_final_blocks == 6
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, 4
+%elif %%num_final_blocks == 7
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, 6
+%elif %%num_final_blocks == 8
+ vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7
+%elif %%num_final_blocks == 9
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, 2
+%elif %%num_final_blocks == 10
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, 4
+%elif %%num_final_blocks == 11
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, 6
+%elif %%num_final_blocks == 12
+ vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11
+%elif %%num_final_blocks == 13
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, 2
+%elif %%num_final_blocks == 14
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, 4
+%elif %%num_final_blocks == 15
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, 6
+%endif
+
+ ;; AES rounds
+%assign j 0
+%rep (%%NROUNDS + 2)
+ ZMM_AESDEC_ROUND_BLOCKS_0_16 %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15, \
+ ZKEY %+ j, j, no_data, no_data, no_data, no_data, \
+ %%num_final_blocks, %%NROUNDS
+%assign j (j + 1)
+%endrep
+
+ ;; XOR with decrypted blocks to get plain text
+ vpxorq %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, %%ZT1
+%if %%num_final_blocks > 4
+ vpxorq %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, %%ZT2
+%endif
+%if %%num_final_blocks > 8
+ vpxorq %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, %%ZT3
+%endif
+%if %%num_final_blocks > 12
+ vpxorq %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, %%ZT4
+%endif
+
+ ;; write plain text back to output
+%ifdef CBCS
+ ZMM_STORE_BLOCKS_0_16_OFFSET %%num_final_blocks, %%PLAIN_OUT, \
+ OFFSET, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15
+%else
+ ZMM_STORE_BLOCKS_0_16 %%num_final_blocks, %%PLAIN_OUT, 0, \
+ %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15
+%endif
+
+%endmacro ; FINAL_BLOCKS
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Main AES-CBC decrypt macro
+;;; - operates on single stream
+;;; - decrypts 16 blocks at a time
+%macro DECRYPT_16_PARALLEL 14
+%define %%PLAIN_OUT %1 ; [in] output buffer
+%define %%CIPH_IN %2 ; [in] input buffer
+%define %%LENGTH %3 ; [in/out] number of bytes to process
+%define %%LAST_CIPH_BLK %4 ; [in/out] ZMM with IV (first block) or last cipher block (idx 3)
+%define %%CIPHER_PLAIN_0_3 %5 ; [out] ZMM next 0-3 cipher blocks
+%define %%CIPHER_PLAIN_4_7 %6 ; [out] ZMM next 4-7 cipher blocks
+%define %%CIPHER_PLAIN_8_11 %7 ; [out] ZMM next 8-11 cipher blocks
+%define %%CIPHER_PLAIN_12_15 %8 ; [out] ZMM next 12-15 cipher blocks
+%define %%ZT1 %9 ; [clobbered] ZMM temporary
+%define %%ZT2 %10 ; [clobbered] ZMM temporary
+%define %%ZT3 %11 ; [clobbered] ZMM temporary
+%define %%ZT4 %12 ; [clobbered] ZMM temporary
+%define %%NROUNDS %13 ; [in] number of rounds; numerical value
+%define %%IA0 %14 ; [clobbered] GP temporary
+
+%ifdef CBCS
+ ZMM_LOAD_BLOCKS_0_16_OFFSET 16, %%CIPH_IN, OFFSET, \
+ %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15
+%else
+ vmovdqu8 %%CIPHER_PLAIN_0_3, [%%CIPH_IN]
+ vmovdqu8 %%CIPHER_PLAIN_4_7, [%%CIPH_IN + 64]
+ vmovdqu8 %%CIPHER_PLAIN_8_11, [%%CIPH_IN + 128]
+ vmovdqu8 %%CIPHER_PLAIN_12_15, [%%CIPH_IN + 192]
+%endif
+ ;; prepare first set of cipher blocks for later XOR'ing
+ valignq %%ZT1, %%CIPHER_PLAIN_0_3, %%LAST_CIPH_BLK, 6
+ valignq %%ZT2, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_0_3, 6
+ valignq %%ZT3, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_4_7, 6
+ valignq %%ZT4, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_8_11, 6
+
+ ;; store last cipher text block to be used for next 16 blocks
+ vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15
+
+ ;; AES rounds
+%assign j 0
+%rep (%%NROUNDS + 2)
+ ZMM_AESDEC_ROUND_BLOCKS_0_16 %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15, \
+ ZKEY %+ j, j, no_data, no_data, no_data, no_data, \
+ 16, %%NROUNDS
+%assign j (j + 1)
+%endrep
+
+ ;; XOR with decrypted blocks to get plain text
+ vpxorq %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, %%ZT1
+ vpxorq %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, %%ZT2
+ vpxorq %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, %%ZT3
+ vpxorq %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, %%ZT4
+
+ ;; write plain text back to output
+%ifdef CBCS
+ ZMM_STORE_BLOCKS_0_16_OFFSET 16, %%PLAIN_OUT, OFFSET, \
+ %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15
+%else
+ vmovdqu8 [%%PLAIN_OUT], %%CIPHER_PLAIN_0_3
+ vmovdqu8 [%%PLAIN_OUT + 64], %%CIPHER_PLAIN_4_7
+ vmovdqu8 [%%PLAIN_OUT + 128], %%CIPHER_PLAIN_8_11
+ vmovdqu8 [%%PLAIN_OUT + 192], %%CIPHER_PLAIN_12_15
+%endif
+ ;; adjust input pointer and length
+ sub %%LENGTH, (16 * 16)
+ add %%CIPH_IN, (16 * OFFSET)
+ add %%PLAIN_OUT, (16 * OFFSET)
+
+%endmacro ; DECRYPT_16_PARALLEL
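+
+;;; Note: the pointers advance by 16 * OFFSET, so the same macro serves both the
+;;; contiguous CBC layout (16-byte stride) and the strided CBCS layout
+;;; (160-byte stride); %%LENGTH always drops by 256 bytes, the 16 blocks
+;;; actually processed.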
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; AES_CBC_DEC macro decrypts given data.
+;;; Flow:
+;;; - Decrypt all blocks (multiple of 16) up to final 1-15 blocks
+;;; - Decrypt final blocks (1-15 blocks)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro AES_CBC_DEC 7
+%define %%CIPH_IN %1 ;; [in] pointer to input buffer
+%define %%PLAIN_OUT %2 ;; [in] pointer to output buffer
+%define %%KEYS %3 ;; [in] pointer to expanded keys
+%define %%IV %4 ;; [in] pointer to IV
+%define %%LENGTH %5 ;; [in/out] GP register with length in bytes
+%define %%NROUNDS %6 ;; [in] Number of AES rounds; numerical value
+%define %%TMP %7 ;; [clobbered] GP register
+
+ cmp %%LENGTH, 0
+ je %%cbc_dec_done
+
+ vinserti64x2 zIV, zIV, [%%IV], 3
+
+ ;; preload keys
+ LOAD_KEYS %%KEYS, %%NROUNDS
+
+%%decrypt_16_parallel:
+ cmp %%LENGTH, 256
+ jb %%final_blocks
+
+ DECRYPT_16_PARALLEL %%PLAIN_OUT, %%CIPH_IN, %%LENGTH, zIV, \
+ zBLK_0_3, zBLK_4_7, zBLK_8_11, zBLK_12_15, \
+ zTMP0, zTMP1, zTMP2, zTMP3, %%NROUNDS, %%TMP
+ jmp %%decrypt_16_parallel
+
+%%final_blocks:
+ ;; get num final blocks
+ shr %%LENGTH, 4
+ and %%LENGTH, 0xf
+ je %%cbc_dec_done
+
+ cmp %%LENGTH, 8
+ je %%final_num_blocks_is_8
+ jl %%final_blocks_is_1_7
+
+ ; Final blocks 9-15
+ cmp %%LENGTH, 12
+ je %%final_num_blocks_is_12
+ jl %%final_blocks_is_9_11
+
+ ; Final blocks 13-15
+ cmp %%LENGTH, 15
+ je %%final_num_blocks_is_15
+ cmp %%LENGTH, 14
+ je %%final_num_blocks_is_14
+ cmp %%LENGTH, 13
+ je %%final_num_blocks_is_13
+
+%%final_blocks_is_9_11:
+ cmp %%LENGTH, 11
+ je %%final_num_blocks_is_11
+ cmp %%LENGTH, 10
+ je %%final_num_blocks_is_10
+ cmp %%LENGTH, 9
+ je %%final_num_blocks_is_9
+
+%%final_blocks_is_1_7:
+ cmp %%LENGTH, 4
+ je %%final_num_blocks_is_4
+ jl %%final_blocks_is_1_3
+
+ ; Final blocks 5-7
+ cmp %%LENGTH, 7
+ je %%final_num_blocks_is_7
+ cmp %%LENGTH, 6
+ je %%final_num_blocks_is_6
+ cmp %%LENGTH, 5
+ je %%final_num_blocks_is_5
+
+%%final_blocks_is_1_3:
+ cmp %%LENGTH, 3
+ je %%final_num_blocks_is_3
+ cmp %%LENGTH, 2
+ je %%final_num_blocks_is_2
+ jmp %%final_num_blocks_is_1
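+
+	;; The compare ladder above is a small branch tree: it selects one of 15
+	;; FINAL_BLOCKS expansions so the block count is an assembly-time constant
+	;; on each path (presumably letting the 0-16 block load/store helpers in
+	;; aes_common.asm specialize their masking per count).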
+
+
+%%final_num_blocks_is_15:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 15, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_14:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 14, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_13:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 13, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_12:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 12, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_11:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 11, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_10:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 10, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_9:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 9, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_8:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 8, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_7:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 7, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_6:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 6, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_5:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 5, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_4:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 4, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_3:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 3, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_2:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 2, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_1:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 1, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+
+%%cbc_dec_done:
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+section .text
+
+%ifndef CBCS
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_128_vaes_avx512(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+mk_global aes_cbc_dec_128_vaes_avx512,function,internal
+aes_cbc_dec_128_vaes_avx512:
+ endbranch
+%ifidn __OUTPUT_FORMAT__, win64
+ mov num_bytes, [rsp + 8*5]
+%endif
+ AES_CBC_DEC p_in, p_out, p_keys, p_IV, num_bytes, 9, tmp
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_192_vaes_avx512(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+mk_global aes_cbc_dec_192_vaes_avx512,function,internal
+aes_cbc_dec_192_vaes_avx512:
+ endbranch
+%ifidn __OUTPUT_FORMAT__, win64
+ mov num_bytes, [rsp + 8*5]
+%endif
+ AES_CBC_DEC p_in, p_out, p_keys, p_IV, num_bytes, 11, tmp
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_256_vaes_avx512(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+mk_global aes_cbc_dec_256_vaes_avx512,function,internal
+aes_cbc_dec_256_vaes_avx512:
+ endbranch
+%ifidn __OUTPUT_FORMAT__, win64
+ mov num_bytes, [rsp + 8*5]
+%endif
+ AES_CBC_DEC p_in, p_out, p_keys, p_IV, num_bytes, 13, tmp
+
+ ret
+
+%endif ;; CBCS
+
+%else ; Assembler doesn't understand these opcodes. Add empty symbol for Windows.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_aes_cbc_dec_256_vaes_avx512
+no_aes_cbc_dec_256_vaes_avx512:
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm
new file mode 100644
index 000000000..a7fbf39b4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm
@@ -0,0 +1,137 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 128 bit CBC AES encrypt
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_128_x4(void *in,
+;; uint8_t *IV,
+;; uint8_t *keys,
+;; void *out,
+;; uint64_t len_bytes);
+; arg 1: IN:   pointer to input (plain text)
+; arg 2: IV:   pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT:  pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define KEYS0 rdx
+%define OUT0 rcx
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx
+%define IN rcx
+%define IV rdx
+%define KEYS0 r8
+%define OUT0 r9
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+%define KEY_ROUNDS 11
+%define XMM_USAGE (16)
+%define UNROLLED_LOOPS (3)
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS)
+%define EARLY_BLOCKS (2)
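+
+; Note: CBC encryption chains each cipher block into the next plain-text
+; block, so a single buffer (the "_sb" variants) cannot be encrypted
+; block-parallel the way the decrypt side is; UNROLLED_LOOPS unrolls that
+; serial chain instead, and PARALLEL_BLOCKS is simply aliased to it.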
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu
+%define PXOR pxor
+%define AES_ENC aesenc
+%define AES_ENC_LAST aesenclast
+
+%include "cbc_common.asm"
+
+
+mk_global aes_cbc_enc_128_x4, function
+func(aes_cbc_enc_128_x4)
+ endbranch
+ FUNC_SAVE
+
+ mov IDX, 0
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
+ jne main_loop
+
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm
new file mode 100644
index 000000000..24ab33fe5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm
@@ -0,0 +1,151 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 128 bit CBC AES encrypt
+;; clobbers all registers except for ARG1 and rbp
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_128_x8(void *in,
+;; uint8_t *IV,
+;;	uint8_t *keys,
+;; void *out,
+;; uint64_t len_bytes);
+; arg 1: IN: pointer to input (plain text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define KEYS0 rdx
+%define OUT0 rcx
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx
+%define IN rcx
+%define IV rdx
+%define KEYS0 r8
+%define OUT0 r9
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ vmovdqa [rsp + 0*16], xmm6
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ vmovdqa [rsp + 4*16], xmm10
+ vmovdqa [rsp + 5*16], xmm11
+ vmovdqa [rsp + 6*16], xmm12
+ vmovdqa [rsp + 7*16], xmm13
+ vmovdqa [rsp + 8*16], xmm14
+ vmovdqa [rsp + 9*16], xmm15
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 0*16]
+ vmovdqa xmm7, [rsp + 1*16]
+ vmovdqa xmm8, [rsp + 2*16]
+ vmovdqa xmm9, [rsp + 3*16]
+ vmovdqa xmm10, [rsp + 4*16]
+ vmovdqa xmm11, [rsp + 5*16]
+ vmovdqa xmm12, [rsp + 6*16]
+ vmovdqa xmm13, [rsp + 7*16]
+ vmovdqa xmm14, [rsp + 8*16]
+ vmovdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+%define KEY_ROUNDS 11
+%define XMM_USAGE (16)
+%define UNROLLED_LOOPS (3)
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+%define MOVDQ vmovdqu
+%macro PXOR 2
+ vpxor %1, %1, %2
+%endm
+
+%macro AES_ENC 2
+ vaesenc %1, %1, %2
+%endm
+
+%macro AES_ENC_LAST 2
+ vaesenclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+
+mk_global aes_cbc_enc_128_x8, function
+func(aes_cbc_enc_128_x8)
+ endbranch
+ FUNC_SAVE
+
+ mov IDX, 0
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
+ jne main_loop
+
+done:
+
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm
new file mode 100644
index 000000000..b3d80e922
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm
@@ -0,0 +1,149 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 192 bit CBC AES encrypt
+;;; Updates In and Out pointers at end
+
+;include "mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%define MOVDQ movdqu ;; assume buffers not aligned
+%macro pxor2 2
+ MOVDQ XTMP, %2
+ pxor %1, XTMP
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_192_x4(void *in,
+;; uint8_t *IV,
+;;	uint8_t *keys,
+;; void *out,
+;; uint64_t len_bytes);
+; arg 1: IN: pointer to input (plain text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define KEYS0 rdx
+%define OUT0 rcx
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx
+%define IN rcx
+%define IV rdx
+%define KEYS0 r8
+%define OUT0 r9
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+%define KEY_ROUNDS 13
+%define XMM_USAGE (16)
+%define UNROLLED_LOOPS (3)
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS)
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu
+%define PXOR pxor
+%define AES_ENC aesenc
+%define AES_ENC_LAST aesenclast
+
+%include "cbc_common.asm"
+
+
+mk_global aes_cbc_enc_192_x4, function
+func(aes_cbc_enc_192_x4)
+ endbranch
+ FUNC_SAVE
+
+ mov IDX, 0
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
+ jne main_loop
+
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm
new file mode 100644
index 000000000..89d233819
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm
@@ -0,0 +1,147 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 192 bit CBC AES encrypt
+;; clobbers all registers except for ARG1 and rbp
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_192_x8(void *in,
+;; uint8_t *IV,
+;;	uint8_t *keys,
+;; void *out,
+;; uint64_t len_bytes);
+; arg 1: IN: pointer to input (plain text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define KEYS0 rdx
+%define OUT0 rcx
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx
+%define IN rcx
+%define IV rdx
+%define KEYS0 r8
+%define OUT0 r9
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ vmovdqa [rsp + 0*16], xmm6
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ vmovdqa [rsp + 4*16], xmm10
+ vmovdqa [rsp + 5*16], xmm11
+ vmovdqa [rsp + 6*16], xmm12
+ vmovdqa [rsp + 7*16], xmm13
+ vmovdqa [rsp + 8*16], xmm14
+ vmovdqa [rsp + 9*16], xmm15
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 0*16]
+ vmovdqa xmm7, [rsp + 1*16]
+ vmovdqa xmm8, [rsp + 2*16]
+ vmovdqa xmm9, [rsp + 3*16]
+ vmovdqa xmm10, [rsp + 4*16]
+ vmovdqa xmm11, [rsp + 5*16]
+ vmovdqa xmm12, [rsp + 6*16]
+ vmovdqa xmm13, [rsp + 7*16]
+ vmovdqa xmm14, [rsp + 8*16]
+ vmovdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+%define KEY_ROUNDS 13
+%define XMM_USAGE (16)
+%define UNROLLED_LOOPS (3)
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS)
+
+; instruction set specific operation definitions
+%define MOVDQ vmovdqu
+%macro PXOR 2
+ vpxor %1, %1, %2
+%endm
+
+%macro AES_ENC 2
+ vaesenc %1, %1, %2
+%endm
+
+%macro AES_ENC_LAST 2
+ vaesenclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+mk_global aes_cbc_enc_192_x8, function
+func(aes_cbc_enc_192_x8)
+ endbranch
+ FUNC_SAVE
+
+ mov IDX, 0
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
+ jne main_loop
+
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm
new file mode 100644
index 000000000..ab37668c7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm
@@ -0,0 +1,141 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 256 bit CBC AES encrypt
+;;; Updates In and Out pointers at end
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_256_x4(void *in,
+;; uint8_t *IV,
+;;	uint8_t *keys,
+;; void *out,
+;; uint64_t len_bytes);
+; arg 1: IN: pointer to input (plain text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define KEYS0 rdx
+%define OUT0 rcx
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx
+%define IN rcx
+%define IV rdx
+%define KEYS0 r8
+%define OUT0 r9
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+%define KEY_ROUNDS 15
+%define XMM_USAGE (16)
+%define UNROLLED_LOOPS (3)
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS)
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu
+%define PXOR pxor
+%define AES_ENC aesenc
+%define AES_ENC_LAST aesenclast
+
+%include "cbc_common.asm"
+
+
+mk_global aes_cbc_enc_256_x4, function
+func(aes_cbc_enc_256_x4)
+ endbranch
+ FUNC_SAVE
+
+ mov IDX, 0
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
+ jne main_loop
+
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm
new file mode 100644
index 000000000..83e53ac11
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm
@@ -0,0 +1,148 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 256 bit CBC AES encrypt
+;; clobbers all registers except for ARG1 and rbp
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_256_x8(void *in,
+;; uint8_t *IV,
+;;	uint8_t *keys,
+;; void *out,
+;; uint64_t len_bytes);
+; arg 1: IN: pointer to input (plain text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define KEYS0 rdx
+%define OUT0 rcx
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx
+%define IN rcx
+%define IV rdx
+%define KEYS0 r8
+%define OUT0 r9
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ vmovdqa [rsp + 0*16], xmm6
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ vmovdqa [rsp + 4*16], xmm10
+ vmovdqa [rsp + 5*16], xmm11
+ vmovdqa [rsp + 6*16], xmm12
+ vmovdqa [rsp + 7*16], xmm13
+ vmovdqa [rsp + 8*16], xmm14
+ vmovdqa [rsp + 9*16], xmm15
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 0*16]
+ vmovdqa xmm7, [rsp + 1*16]
+ vmovdqa xmm8, [rsp + 2*16]
+ vmovdqa xmm9, [rsp + 3*16]
+ vmovdqa xmm10, [rsp + 4*16]
+ vmovdqa xmm11, [rsp + 5*16]
+ vmovdqa xmm12, [rsp + 6*16]
+ vmovdqa xmm13, [rsp + 7*16]
+ vmovdqa xmm14, [rsp + 8*16]
+ vmovdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+%define KEY_ROUNDS 15
+%define XMM_USAGE (16)
+%define UNROLLED_LOOPS (3)
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS)
+
+; instruction set specific operation definitions
+%define MOVDQ vmovdqu
+%macro PXOR 2
+ vpxor %1, %1, %2
+%endm
+
+%macro AES_ENC 2
+ vaesenc %1, %1, %2
+%endm
+
+%macro AES_ENC_LAST 2
+ vaesenclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+
+mk_global aes_cbc_enc_256_x8, function
+func(aes_cbc_enc_256_x8)
+ endbranch
+ FUNC_SAVE
+
+ mov IDX, 0
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
+ jne main_loop
+
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_multibinary.asm
new file mode 100644
index 000000000..0cc09afe1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_multibinary.asm
@@ -0,0 +1,102 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+default rel
+[bits 64]
+
+extern aes_cbc_dec_128_sse
+extern aes_cbc_dec_128_avx
+extern aes_cbc_dec_192_sse
+extern aes_cbc_dec_192_avx
+extern aes_cbc_dec_256_sse
+extern aes_cbc_dec_256_avx
+
+extern aes_cbc_enc_128_x4
+extern aes_cbc_enc_128_x8
+extern aes_cbc_enc_192_x4
+extern aes_cbc_enc_192_x8
+extern aes_cbc_enc_256_x4
+extern aes_cbc_enc_256_x8
+
+%if (AS_FEATURE_LEVEL) >= 10
+extern aes_cbc_dec_128_vaes_avx512
+extern aes_cbc_dec_192_vaes_avx512
+extern aes_cbc_dec_256_vaes_avx512
+%endif
+
+%include "multibinary.asm"
+
+;;;;
+; instantiate the aes_cbc enc and dec multibinary interfaces
+;;;;
+mbin_interface aes_cbc_dec_128
+mbin_dispatch_init7 aes_cbc_dec_128, \
+ aes_cbc_dec_128_sse, \
+ aes_cbc_dec_128_sse, \
+ aes_cbc_dec_128_avx, \
+ aes_cbc_dec_128_avx, \
+ aes_cbc_dec_128_avx, \
+ aes_cbc_dec_128_vaes_avx512
+
+mbin_interface aes_cbc_dec_192
+mbin_dispatch_init7 aes_cbc_dec_192, \
+ aes_cbc_dec_192_sse, \
+ aes_cbc_dec_192_sse, \
+ aes_cbc_dec_192_avx, \
+ aes_cbc_dec_192_avx, \
+ aes_cbc_dec_192_avx, \
+ aes_cbc_dec_192_vaes_avx512
+
+mbin_interface aes_cbc_dec_256
+mbin_dispatch_init7 aes_cbc_dec_256, \
+ aes_cbc_dec_256_sse, \
+ aes_cbc_dec_256_sse, \
+ aes_cbc_dec_256_avx, \
+ aes_cbc_dec_256_avx, \
+ aes_cbc_dec_256_avx, \
+ aes_cbc_dec_256_vaes_avx512
+
+mbin_interface aes_cbc_enc_128
+mbin_dispatch_init aes_cbc_enc_128, aes_cbc_enc_128_x4, aes_cbc_enc_128_x8, aes_cbc_enc_128_x8
+mbin_interface aes_cbc_enc_192
+mbin_dispatch_init aes_cbc_enc_192, aes_cbc_enc_192_x4, aes_cbc_enc_192_x8, aes_cbc_enc_192_x8
+mbin_interface aes_cbc_enc_256
+mbin_dispatch_init aes_cbc_enc_256, aes_cbc_enc_256_x4, aes_cbc_enc_256_x8, aes_cbc_enc_256_x8
+
+
+
+;;; func core, ver, snum
+slversion aes_cbc_enc_128, 00, 00, 0291
+slversion aes_cbc_dec_128, 00, 00, 0292
+slversion aes_cbc_enc_192, 00, 00, 0293
+slversion aes_cbc_dec_192, 00, 00, 0294
+slversion aes_cbc_enc_256, 00, 00, 0295
+slversion aes_cbc_dec_256, 00, 00, 0296
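
cbc_multibinary.asm is what turns the per-ISA encryptors and decryptors above into the single aes_cbc_enc_* / aes_cbc_dec_* symbols the library exports: the mbin_* macros install a resolver that, on first call, picks the best implementation the CPU supports (SSE, the AVX x8 variants, or VAES/AVX-512 on the decrypt paths) and jumps straight to it from then on. The C sketch below only illustrates that self-patching dispatch pattern; example_aes_cbc_enc_128() and cpu_has_avx() are placeholders, not the real macro expansion or feature test.

    #include <stdint.h>

    typedef void (*cbc_enc_fn)(void *in, uint8_t *iv, uint8_t *keys,
                               void *out, uint64_t len_bytes);

    /* The two real implementations added by this patch. */
    void aes_cbc_enc_128_x4(void *, uint8_t *, uint8_t *, void *, uint64_t);
    void aes_cbc_enc_128_x8(void *, uint8_t *, uint8_t *, void *, uint64_t);

    int cpu_has_avx(void);                  /* placeholder for the CPUID probe */

    static void resolve_first_call(void *, uint8_t *, uint8_t *, void *, uint64_t);
    static cbc_enc_fn dispatched = resolve_first_call;

    static void resolve_first_call(void *in, uint8_t *iv, uint8_t *keys,
                                   void *out, uint64_t len)
    {
            /* Decide once; every later call bypasses this function. */
            dispatched = cpu_has_avx() ? aes_cbc_enc_128_x8 : aes_cbc_enc_128_x4;
            dispatched(in, iv, keys, out, len);
    }

    /* Stands in for the exported aes_cbc_enc_128 entry point. */
    void example_aes_cbc_enc_128(void *in, uint8_t *iv, uint8_t *keys,
                                 void *out, uint64_t len)
    {
            dispatched(in, iv, keys, out, len);
    }
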
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c
new file mode 100644
index 000000000..7ae5c9078
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c
@@ -0,0 +1,339 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include <aes_cbc.h>
+#include <test.h>
+#include "ossl_helper.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static unsigned char const ic[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
+ 0x0e, 0x0f
+};
+
+static unsigned char *plaintext, *cbc_plaintext, *cyphertext, *ossl_plaintext,
+ *ossl_cyphertext;
+static uint8_t test_key[CBC_256_BITS];
+
+void mk_rand_data(uint8_t * data, uint32_t size)
+{
+ unsigned int i;
+ for (i = 0; i < size; i++) {
+ *data++ = rand();
+ }
+}
+
+int aes_128_perf(uint8_t * key)
+{
+ int i, ret;
+
+ /* Initialize our cipher context, which can use same input vectors */
+ uint8_t *iv = NULL;
+ struct cbc_key_data *key_data = NULL;
+
+ ret = posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN));
+ if (ret) {
+		printf("alloc error: Fail\n");
+ return 1;
+ }
+ ret = posix_memalign((void **)&key_data, 16, (sizeof(*key_data)));
+ if (ret) {
+		printf("alloc error: Fail\n");
+ return 1;
+ }
+ if ((NULL == iv) || (NULL == key_data))
+ return 1;
+
+ memcpy(iv, ic, CBC_IV_DATA_LEN);
+
+ aes_cbc_precomp(key, 128, key_data);
+ aes_cbc_enc_128(plaintext, iv, key_data->enc_keys, cyphertext, TEST_LEN);
+ openssl_aes_128_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext);
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_cbc_enc_128(plaintext, iv, key_data->enc_keys,
+ plaintext, TEST_LEN);
+ }
+
+ perf_stop(&stop);
+ printf("ISA-L__aes_cbc_128_encode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_128_cbc_enc(key, iv, TEST_LEN, plaintext, plaintext);
+ }
+
+ perf_stop(&stop);
+ printf("OpenSSL_aes_cbc_128_encode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_cbc_dec_128(cyphertext, iv, key_data->dec_keys,
+ cbc_plaintext, TEST_LEN);
+ }
+
+ perf_stop(&stop);
+ printf("ISA-L__aes_cbc_128_decode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_128_cbc_dec(key, iv, TEST_LEN,
+ ossl_cyphertext, ossl_plaintext);
+ }
+
+ perf_stop(&stop);
+ printf("OpenSSL_aes_cbc_128_decode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ printf("\n");
+ return 0;
+}
+
+int aes_192_perf(uint8_t * key)
+{
+ int i, ret;
+ uint8_t *iv = NULL;
+ struct cbc_key_data *key_data = NULL;
+
+ ret = posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN));
+ if (ret) {
+		printf("alloc error: Fail\n");
+ return 1;
+ }
+ ret = posix_memalign((void **)&key_data, 16, (sizeof(*key_data)));
+ if (ret) {
+		printf("alloc error: Fail\n");
+ return 1;
+ }
+ if ((NULL == iv) || (NULL == key_data))
+ return 1;
+
+ memcpy(iv, ic, CBC_IV_DATA_LEN);
+ aes_cbc_precomp(key, 192, key_data);
+ aes_cbc_enc_192(plaintext, iv, key_data->enc_keys, cyphertext, TEST_LEN);
+ openssl_aes_192_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext);
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_cbc_enc_192(plaintext, iv, key_data->enc_keys,
+ cyphertext, TEST_LEN);
+ }
+
+ perf_stop(&stop);
+ printf("ISA-L__aes_cbc_192_encode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_192_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext);
+ }
+
+ perf_stop(&stop);
+ printf("OpenSSL_aes_cbc_192_encode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_cbc_dec_192(cyphertext, iv, key_data->dec_keys,
+ cbc_plaintext, TEST_LEN);
+ }
+
+ perf_stop(&stop);
+ printf("ISA-L__aes_cbc_192_decode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_192_cbc_dec(key, iv, TEST_LEN,
+ ossl_cyphertext, ossl_plaintext);
+ }
+
+ perf_stop(&stop);
+ printf("OpenSSL_aes_cbc_192_decode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ printf("\n");
+ return 0;
+}
+
+int aes_256_perf(uint8_t * key)
+{
+ int i, ret;
+ uint8_t *iv = NULL;
+ struct cbc_key_data *key_data = NULL;
+
+ ret = posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN));
+ if (ret) {
+		printf("alloc error: Fail\n");
+ return 1;
+ }
+ ret = posix_memalign((void **)&key_data, 16, (sizeof(*key_data)));
+ if (ret) {
+		printf("alloc error: Fail\n");
+ return 1;
+ }
+ if ((NULL == iv) || (NULL == key_data))
+ return 1;
+
+ aes_cbc_precomp(key, 256, key_data);
+ memcpy(iv, ic, CBC_IV_DATA_LEN);
+ aes_cbc_enc_256(plaintext, iv, key_data->enc_keys, cyphertext, TEST_LEN);
+ openssl_aes_256_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext);
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_cbc_enc_256(plaintext, iv, key_data->enc_keys,
+ cyphertext, TEST_LEN);
+ }
+
+ perf_stop(&stop);
+ printf("ISA-L__aes_cbc_256_encode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_256_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext);
+ }
+
+ perf_stop(&stop);
+ printf("OpenSSL_aes_cbc_256_encode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_cbc_dec_256(cyphertext, iv, key_data->dec_keys,
+ cbc_plaintext, TEST_LEN);
+ }
+
+ perf_stop(&stop);
+ printf("ISA-L__aes_cbc_256_decode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_256_cbc_dec(key, iv, TEST_LEN,
+ ossl_cyphertext, ossl_plaintext);
+ }
+
+ perf_stop(&stop);
+ printf("OpenSSL_aes_cbc_256_decode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ printf("\n");
+ return 0;
+}
+
+int main(void)
+{
+ uint32_t OK = 0;
+
+ srand(TEST_SEED);
+
+ plaintext = malloc(TEST_LEN);
+ cbc_plaintext = malloc(TEST_LEN);
+ cyphertext = malloc(TEST_LEN);
+ ossl_plaintext = malloc(TEST_LEN);
+ ossl_cyphertext = malloc(TEST_LEN);
+ if (NULL == plaintext || NULL == cyphertext || NULL == cbc_plaintext
+ || NULL == ossl_plaintext || NULL == ossl_cyphertext) {
+ printf("malloc of testsize:0x%x failed\n", TEST_LEN);
+ return 1;
+ }
+
+ mk_rand_data(plaintext, TEST_LEN);
+ mk_rand_data(test_key, sizeof(test_key));
+ printf("AES CBC ISA-L vs OpenSSL performance:\n");
+ OK += aes_128_perf(test_key);
+ OK += aes_192_perf(test_key);
+ OK += aes_256_perf(test_key);
+
+ return OK;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c b/src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c
new file mode 100644
index 000000000..8e8f41792
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c
@@ -0,0 +1,56 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <aes_cbc.h>
+#include <aes_keyexp.h>
+
+int aes_cbc_precomp(uint8_t * key, int key_size, struct cbc_key_data *keys_blk)
+{
+ if (CBC_128_BITS == key_size) {
+ aes_keyexp_128(key, keys_blk->enc_keys, keys_blk->dec_keys);
+ } else if (CBC_192_BITS == key_size) {
+ aes_keyexp_192(key, keys_blk->enc_keys, keys_blk->dec_keys);
+ } else if (CBC_256_BITS == key_size) {
+ aes_keyexp_256(key, keys_blk->enc_keys, keys_blk->dec_keys);
+ } else {
+ //Invalid key length
+ return 1;
+ }
+ return 0;
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// Version info
+struct slver aes_cbc_precomp_slver_00000297;
+struct slver aes_cbc_precomp_slver = { 0x0297, 0x00, 0x00 };
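
aes_cbc_precomp() above is the one-time key-schedule step: it selects the matching aes_keyexp_*() routine by key size, fills key_data->enc_keys and ->dec_keys, and returns non-zero for any other key length, so callers should check the return before using the round keys (the perf tool above does not). A brief, illustrative check, assuming the constants declared in aes_cbc.h:

    #include <stdint.h>
    #include <stdio.h>
    #include <aes_cbc.h>

    /* Expand a 256-bit key, refusing to continue on an unsupported size. */
    static int expand_or_fail(uint8_t *raw_key, struct cbc_key_data *kd)
    {
            if (aes_cbc_precomp(raw_key, CBC_256_BITS, kd) != 0) {
                    fprintf(stderr, "aes_cbc_precomp: unsupported key size\n");
                    return -1;
            }
            /* kd->enc_keys / kd->dec_keys are now valid for aes_cbc_enc_256()
             * and aes_cbc_dec_256(). */
            return 0;
    }
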
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors.h b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors.h
new file mode 100644
index 000000000..7bebcaed4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors.h
@@ -0,0 +1,466 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef AES_CBC_STD_VECTORS_H_
+#define AES_CBC_STD_VECTORS_H_
+#include <aes_cbc.h>
+
+
+// struct to hold pointers to the cbc data vectors
+struct cbc_vector {
+ uint8_t* K; // AES Key
+ cbc_key_size K_LEN; // length of key in bits
+	uint8_t* IV;		// initialization vector (IV) used by CBC
+ uint64_t P_LEN; // length of our plaintext
+ uint8_t* P; // Plain text
+ //outputs of encryption
+ uint8_t* EXP_C; // same length as P
+ // used in vector checks, not populated in std vector array
+ uint8_t *C;
+ struct cbc_key_data *KEYS;
+};
+
+
+///////////////////////////////////////////
+// Test vectors from:
+// Intel IPSec library 1..3
+//
+///////////////////////////////////////////
+static unsigned char K1[] = {
+ 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c
+};
+static unsigned char IV1[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+};
+static unsigned char P1[] = {
+ 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a,
+ 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51,
+ 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef,
+ 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10
+};
+static unsigned char C1[] = {
+ 0x76, 0x49, 0xab, 0xac, 0x81, 0x19, 0xb2, 0x46, 0xce, 0xe9, 0x8e, 0x9b, 0x12, 0xe9, 0x19, 0x7d,
+ 0x50, 0x86, 0xcb, 0x9b, 0x50, 0x72, 0x19, 0xee, 0x95, 0xdb, 0x11, 0x3a, 0x91, 0x76, 0x78, 0xb2,
+ 0x73, 0xbe, 0xd6, 0xb8, 0xe3, 0xc1, 0x74, 0x3b, 0x71, 0x16, 0xe6, 0x9e, 0x22, 0x22, 0x95, 0x16,
+ 0x3f, 0xf1, 0xca, 0xa1, 0x68, 0x1f, 0xac, 0x09, 0x12, 0x0e, 0xca, 0x30, 0x75, 0x86, 0xe1, 0xa7
+};
+
+static unsigned char K2[] = {
+ 0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81,
+ 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, 0xdf, 0xf4
+};
+static unsigned char IV2[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+};
+static unsigned char P2[] = {
+ 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a,
+ 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51,
+ 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef,
+ 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10
+};
+static unsigned char C2[] = {
+ 0xf5, 0x8c, 0x4c, 0x04, 0xd6, 0xe5, 0xf1, 0xba, 0x77, 0x9e, 0xab, 0xfb, 0x5f, 0x7b, 0xfb, 0xd6,
+ 0x9c, 0xfc, 0x4e, 0x96, 0x7e, 0xdb, 0x80, 0x8d, 0x67, 0x9f, 0x77, 0x7b, 0xc6, 0x70, 0x2c, 0x7d,
+ 0x39, 0xf2, 0x33, 0x69, 0xa9, 0xd9, 0xba, 0xcf, 0xa5, 0x30, 0xe2, 0x63, 0x04, 0x23, 0x14, 0x61,
+ 0xb2, 0xeb, 0x05, 0xe2, 0xc3, 0x9b, 0xe9, 0xfc, 0xda, 0x6c, 0x19, 0x07, 0x8c, 0x6a, 0x9d, 0x1b
+};
+
+static unsigned char K3[] = {
+ 0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81,
+ 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7
+};
+static unsigned char IV3[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+};
+static unsigned char P3[] = {
+ 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a,
+ 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51,
+ 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef,
+ 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10
+};
+static unsigned char C3[] = {
+ 0x17, 0x70, 0x1a, 0x9d, 0x29, 0xc9, 0x1a, 0x94, 0xce, 0xed, 0x72, 0x3c, 0x34, 0xe8,
+ 0x7a, 0xbe, 0x1c, 0x96, 0x84, 0x5c, 0xa8, 0xb7, 0xe8, 0x58, 0x6d, 0xfe, 0xf2, 0xfa,
+ 0x6b, 0xed, 0x24, 0x09, 0x8a, 0x52, 0xce, 0xe8, 0xd7, 0x6d, 0xb6, 0x7b, 0xfd, 0xe2,
+ 0x15, 0x53, 0xd3, 0x1c, 0x28, 0x33, 0xf7, 0x7e, 0xb5, 0x95, 0x00, 0xac, 0x49, 0x03,
+ 0xbc, 0x70, 0x76, 0xb1, 0x84, 0x65, 0xd0, 0xea
+};
+
+///////////////////////////////////////////
+// Test vectors from:
+// 'https://tools.ietf.org/html/rfc3602#section-3.2'
+// The AES-CBC Cipher Algorithm and Its Use with IPsec
+//
+///////////////////////////////////////////
+/*
+Case #1: Encrypting 16 bytes (1 block) using AES-CBC with 128-bit key
+Key : 0x06a9214036b8a15b512e03d534120006
+IV : 0x3dafba429d9eb430b422da802c9fac41
+Plaintext : "Single block msg"
+Ciphertext: 0xe353779c1079aeb82708942dbe77181a
+ *
+ */
+static unsigned char K4[] = {
+ 0x06, 0xa9, 0x21, 0x40, 0x36, 0xb8, 0xa1, 0x5b, 0x51, 0x2e, 0x03, 0xd5, 0x34, 0x12, 0x00, 0x06
+};
+static unsigned char IV4[] = {
+ 0x3d, 0xaf, 0xba, 0x42, 0x9d, 0x9e, 0xb4, 0x30, 0xb4, 0x22, 0xda, 0x80, 0x2c, 0x9f, 0xac, 0x41
+};
+static unsigned char P4[] = {
+ "Single block msg"
+};
+static unsigned char C4[] = {
+ 0xe3, 0x53, 0x77, 0x9c, 0x10, 0x79, 0xae, 0xb8, 0x27, 0x08, 0x94, 0x2d, 0xbe, 0x77, 0x18, 0x1a
+};
+
+/*
+Case #2: Encrypting 32 bytes (2 blocks) using AES-CBC with 128-bit key
+Key : 0xc286696d887c9aa0611bbb3e2025a45a
+IV : 0x562e17996d093d28ddb3ba695a2e6f58
+Plaintext : 0x000102030405060708090a0b0c0d0e0f
+ 101112131415161718191a1b1c1d1e1f
+Ciphertext: 0xd296cd94c2cccf8a3a863028b5e1dc0a
+ 7586602d253cfff91b8266bea6d61ab1
+*/
+static unsigned char K5[] = {
+ 0xc2, 0x86, 0x69, 0x6d, 0x88, 0x7c, 0x9a, 0xa0, 0x61, 0x1b, 0xbb, 0x3e, 0x20, 0x25, 0xa4, 0x5a
+};
+static unsigned char IV5[] = {
+ 0x56, 0x2e, 0x17, 0x99, 0x6d, 0x09, 0x3d, 0x28, 0xdd, 0xb3, 0xba, 0x69, 0x5a, 0x2e, 0x6f, 0x58
+};
+static unsigned char P5[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
+ 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b,
+ 0x1c, 0x1d, 0x1e, 0x1f
+};
+static unsigned char C5[] = {
+ 0xd2, 0x96, 0xcd, 0x94, 0xc2, 0xcc, 0xcf, 0x8a, 0x3a, 0x86, 0x30, 0x28, 0xb5, 0xe1,
+ 0xdc, 0x0a, 0x75, 0x86, 0x60, 0x2d, 0x25, 0x3c, 0xff, 0xf9, 0x1b, 0x82, 0x66, 0xbe,
+ 0xa6, 0xd6, 0x1a, 0xb1
+};
+
+/*
+Case #3: Encrypting 48 bytes (3 blocks) using AES-CBC with 128-bit key
+Key : 0x6c3ea0477630ce21a2ce334aa746c2cd
+IV : 0xc782dc4c098c66cbd9cd27d825682c81
+Plaintext : "This is a 48-byte message (exactly 3 AES blocks)"
+Ciphertext: 0xd0a02b3836451753d493665d33f0e886
+ 2dea54cdb293abc7506939276772f8d5
+ 021c19216bad525c8579695d83ba2684
+
+ */
+static unsigned char K6[] = {
+ 0x6c, 0x3e, 0xa0, 0x47, 0x76, 0x30, 0xce, 0x21, 0xa2, 0xce, 0x33, 0x4a, 0xa7, 0x46, 0xc2, 0xcd
+};
+static unsigned char IV6[] = {
+ 0xc7, 0x82, 0xdc, 0x4c, 0x09, 0x8c, 0x66, 0xcb, 0xd9, 0xcd, 0x27, 0xd8, 0x25, 0x68, 0x2c, 0x81
+};
+static unsigned char P6[] = {
+ "This is a 48-byte message (exactly 3 AES blocks)"
+};
+static unsigned char C6[] = {
+ 0xd0, 0xa0, 0x2b, 0x38, 0x36, 0x45, 0x17, 0x53, 0xd4, 0x93, 0x66, 0x5d, 0x33, 0xf0, 0xe8, 0x86,
+ 0x2d, 0xea, 0x54, 0xcd, 0xb2, 0x93, 0xab, 0xc7, 0x50, 0x69, 0x39, 0x27, 0x67, 0x72, 0xf8, 0xd5,
+ 0x02, 0x1c, 0x19, 0x21, 0x6b, 0xad, 0x52, 0x5c, 0x85, 0x79, 0x69, 0x5d, 0x83, 0xba, 0x26, 0x84
+};
+
+/*
+Case #4: Encrypting 64 bytes (4 blocks) using AES-CBC with 128-bit key
+Key : 0x56e47a38c5598974bc46903dba290349
+IV : 0x8ce82eefbea0da3c44699ed7db51b7d9
+Plaintext : 0xa0a1a2a3a4a5a6a7a8a9aaabacadaeaf
+ b0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ c0c1c2c3c4c5c6c7c8c9cacbcccdcecf
+ d0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+Ciphertext: 0xc30e32ffedc0774e6aff6af0869f71aa
+ 0f3af07a9a31a9c684db207eb0ef8e4e
+ 35907aa632c3ffdf868bb7b29d3d46ad
+ 83ce9f9a102ee99d49a53e87f4c3da55
+ */
+static unsigned char K7[] = {
+ 0x56, 0xe4, 0x7a, 0x38, 0xc5, 0x59, 0x89, 0x74, 0xbc, 0x46, 0x90, 0x3d, 0xba, 0x29, 0x03, 0x49
+};
+static unsigned char IV7[] = {
+ 0x8c, 0xe8, 0x2e, 0xef, 0xbe, 0xa0, 0xda, 0x3c, 0x44, 0x69, 0x9e, 0xd7, 0xdb, 0x51, 0xb7, 0xd9
+};
+static unsigned char P7[] = {
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf
+};
+static unsigned char C7[] = {
+ 0xc3, 0x0e, 0x32, 0xff, 0xed, 0xc0, 0x77, 0x4e, 0x6a, 0xff, 0x6a, 0xf0, 0x86, 0x9f, 0x71, 0xaa,
+ 0x0f, 0x3a, 0xf0, 0x7a, 0x9a, 0x31, 0xa9, 0xc6, 0x84, 0xdb, 0x20, 0x7e, 0xb0, 0xef, 0x8e, 0x4e,
+ 0x35, 0x90, 0x7a, 0xa6, 0x32, 0xc3, 0xff, 0xdf, 0x86, 0x8b, 0xb7, 0xb2, 0x9d, 0x3d, 0x46, 0xad,
+ 0x83, 0xce, 0x9f, 0x9a, 0x10, 0x2e, 0xe9, 0x9d, 0x49, 0xa5, 0x3e, 0x87, 0xf4, 0xc3, 0xda, 0x55
+};
+
+/*
+Case #5: Sample transport-mode ESP packet (ping 192.168.123.100)
+Key: 90d382b4 10eeba7a d938c46c ec1a82bf
+SPI: 4321
+Source address: 192.168.123.3
+Destination address: 192.168.123.100
+Sequence number: 1
+IV: e96e8c08 ab465763 fd098d45 dd3ff893
+
+Original packet:
+IP header (20 bytes): 45000054 08f20000 4001f9fe c0a87b03 c0a87b64
+Data (64 bytes):
+08000ebd a70a0000 8e9c083d b95b0700 08090a0b 0c0d0e0f 10111213 14151617
+18191a1b 1c1d1e1f 20212223 24252627 28292a2b 2c2d2e2f 30313233 34353637
+
+Augment data with:
+Padding: 01020304 05060708 090a0b0c 0d0e
+Pad length: 0e
+Next header: 01 (ICMP)
+
+Pre-encryption Data with padding, pad length and next header (80 bytes):
+08000ebd a70a0000 8e9c083d b95b0700 08090a0b 0c0d0e0f 10111213 14151617
+18191a1b 1c1d1e1f 20212223 24252627 28292a2b 2c2d2e2f 30313233 34353637
+01020304 05060708 090a0b0c 0d0e0e01
+
+Post-encryption packet with SPI, Sequence number, IV:
+IP header: 4500007c 08f20000 4032f9a5 c0a87b03 c0a87b64
+SPI/Seq #: 00004321 00000001
+IV: e96e8c08 ab465763 fd098d45 dd3ff893
+Encrypted Data (80 bytes):
+f663c25d 325c18c6 a9453e19 4e120849 a4870b66 cc6b9965 330013b4 898dc856
+a4699e52 3a55db08 0b59ec3a 8e4b7e52 775b07d1 db34ed9c 538ab50c 551b874a
+a269add0 47ad2d59 13ac19b7 cfbad4a6
+*/
+static unsigned char K8[] = {
+ 0x90, 0xd3, 0x82, 0xb4, 0x10, 0xee, 0xba, 0x7a, 0xd9, 0x38, 0xc4, 0x6c, 0xec, 0x1a, 0x82, 0xbf
+};
+static unsigned char IV8[] = {
+ 0xe9, 0x6e, 0x8c, 0x08, 0xab, 0x46, 0x57, 0x63, 0xfd, 0x09, 0x8d, 0x45, 0xdd, 0x3f, 0xf8, 0x93
+};
+static unsigned char P8[] = {
+ 0x08, 0x00, 0x0e, 0xbd, 0xa7, 0x0a, 0x00, 0x00, 0x8e, 0x9c, 0x08, 0x3d, 0xb9, 0x5b, 0x07, 0x00,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0e, 0x01
+};
+static unsigned char C8[] = {
+ 0xf6, 0x63, 0xc2, 0x5d, 0x32, 0x5c, 0x18, 0xc6, 0xa9, 0x45, 0x3e, 0x19, 0x4e, 0x12, 0x08, 0x49,
+ 0xa4, 0x87, 0x0b, 0x66, 0xcc, 0x6b, 0x99, 0x65, 0x33, 0x00, 0x13, 0xb4, 0x89, 0x8d, 0xc8, 0x56,
+ 0xa4, 0x69, 0x9e, 0x52, 0x3a, 0x55, 0xdb, 0x08, 0x0b, 0x59, 0xec, 0x3a, 0x8e, 0x4b, 0x7e, 0x52,
+ 0x77, 0x5b, 0x07, 0xd1, 0xdb, 0x34, 0xed, 0x9c, 0x53, 0x8a, 0xb5, 0x0c, 0x55, 0x1b, 0x87, 0x4a,
+ 0xa2, 0x69, 0xad, 0xd0, 0x47, 0xad, 0x2d, 0x59, 0x13, 0xac, 0x19, 0xb7, 0xcf, 0xba, 0xd4, 0xa6
+};
+
+/*
+Case #6: Sample transport-mode ESP packet
+ (ping -p 77 -s 20 192.168.123.100)
+Key: 90d382b4 10eeba7a d938c46c ec1a82bf
+SPI: 4321
+Source address: 192.168.123.3
+Destination address: 192.168.123.100
+Sequence number: 8
+IV: 69d08df7 d203329d b093fc49 24e5bd80
+
+Original packet:
+IP header (20 bytes): 45000030 08fe0000 4001fa16 c0a87b03 c0a87b64
+Data (28 bytes):
+0800b5e8 a80a0500 a69c083d 0b660e00 77777777 77777777 77777777
+
+Augment data with:
+Padding: 0102
+Pad length: 02
+Next header: 01 (ICMP)
+
+Pre-encryption Data with padding, pad length and next header (32 bytes):
+0800b5e8 a80a0500 a69c083d 0b660e00 77777777 77777777 77777777 01020201
+
+Post-encryption packet with SPI, Sequence number, IV:
+IP header: 4500004c 08fe0000 4032f9c9 c0a87b03 c0a87b64
+SPI/Seq #: 00004321 00000008
+IV: 69d08df7 d203329d b093fc49 24e5bd80
+Encrypted Data (32 bytes):
+f5199588 1ec4e0c4 488987ce 742e8109 689bb379 d2d750c0 d915dca3 46a89f75
+ */
+static unsigned char K9[] = {
+ 0x90, 0xd3, 0x82, 0xb4, 0x10, 0xee, 0xba, 0x7a, 0xd9, 0x38, 0xc4, 0x6c, 0xec, 0x1a, 0x82, 0xbf
+};
+static unsigned char IV9[] = {
+ 0x69, 0xd0, 0x8d, 0xf7, 0xd2, 0x03, 0x32, 0x9d, 0xb0, 0x93, 0xfc, 0x49, 0x24, 0xe5, 0xbd, 0x80
+};
+static unsigned char P9[] = {
+ 0x08, 0x00, 0xb5, 0xe8, 0xa8, 0x0a, 0x05, 0x00, 0xa6, 0x9c, 0x08, 0x3d, 0x0b, 0x66, 0x0e, 0x00,
+ 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x01, 0x02, 0x02, 0x01
+};
+static unsigned char C9[] = {
+ 0xf5, 0x19, 0x95, 0x88, 0x1e, 0xc4, 0xe0, 0xc4, 0x48, 0x89, 0x87, 0xce, 0x74, 0x2e, 0x81, 0x09,
+ 0x68, 0x9b, 0xb3, 0x79, 0xd2, 0xd7, 0x50, 0xc0, 0xd9, 0x15, 0xdc, 0xa3, 0x46, 0xa8, 0x9f, 0x75
+};
+
+/*
+Case #7: Sample tunnel-mode ESP packet (ping 192.168.123.200)
+Key: 01234567 89abcdef 01234567 89abcdef
+SPI: 8765
+Source address: 192.168.123.3
+Destination address: 192.168.123.200
+Sequence number: 2
+IV: f4e76524 4f6407ad f13dc138 0f673f37
+
+Original packet:
+IP header (20 bytes): 45000054 09040000 4001f988 c0a87b03 c0a87bc8
+Data (64 bytes):
+08009f76 a90a0100 b49c083d 02a20400 08090a0b 0c0d0e0f 10111213 14151617
+18191a1b 1c1d1e1f 20212223 24252627 28292a2b 2c2d2e2f 30313233 34353637
+
+Augment data with:
+Padding: 01020304 05060708 090a
+Pad length: 0a
+Next header: 04 (IP-in-IP)
+
+Pre-encryption Data with original IP header, padding, pad length and
+ next header (96 bytes):
+45000054 09040000 4001f988 c0a87b03 c0a87bc8 08009f76 a90a0100 b49c083d
+02a20400 08090a0b 0c0d0e0f 10111213 14151617 18191a1b 1c1d1e1f 20212223
+24252627 28292a2b 2c2d2e2f 30313233 34353637 01020304 05060708 090a0a04
+
+
+Post-encryption packet with SPI, Sequence number, IV:
+IP header: 4500008c 09050000 4032f91e c0a87b03 c0a87bc8
+SPI/Seq #: 00008765 00000002
+IV: f4e76524 4f6407ad f13dc138 0f673f37
+Encrypted Data (96 bytes):
+773b5241 a4c44922 5e4f3ce5 ed611b0c 237ca96c f74a9301 3c1b0ea1 a0cf70f8
+e4ecaec7 8ac53aad 7a0f022b 859243c6 47752e94 a859352b 8a4d4d2d ecd136e5
+c177f132 ad3fbfb2 201ac990 4c74ee0a 109e0ca1 e4dfe9d5 a100b842 f1c22f0d
+ */
+static unsigned char K10[] = {
+ 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef
+};
+static unsigned char IV10[] = {
+ 0xf4, 0xe7, 0x65, 0x24, 0x4f, 0x64, 0x07, 0xad, 0xf1, 0x3d, 0xc1, 0x38, 0x0f, 0x67, 0x3f, 0x37
+};
+static unsigned char P10[] = {
+ 0x45, 0x00, 0x00, 0x54, 0x09, 0x04, 0x00, 0x00, 0x40, 0x01, 0xf9, 0x88, 0xc0, 0xa8, 0x7b, 0x03,
+ 0xc0, 0xa8, 0x7b, 0xc8, 0x08, 0x00, 0x9f, 0x76, 0xa9, 0x0a, 0x01, 0x00, 0xb4, 0x9c, 0x08, 0x3d,
+ 0x02, 0xa2, 0x04, 0x00, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13,
+ 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23,
+ 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33,
+ 0x34, 0x35, 0x36, 0x37, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0a, 0x04
+
+};
+static unsigned char C10[] = {
+ 0x77, 0x3b, 0x52, 0x41, 0xa4, 0xc4, 0x49, 0x22, 0x5e, 0x4f, 0x3c, 0xe5, 0xed, 0x61, 0x1b, 0x0c,
+ 0x23, 0x7c, 0xa9, 0x6c, 0xf7, 0x4a, 0x93, 0x01, 0x3c, 0x1b, 0x0e, 0xa1, 0xa0, 0xcf, 0x70, 0xf8,
+ 0xe4, 0xec, 0xae, 0xc7, 0x8a, 0xc5, 0x3a, 0xad, 0x7a, 0x0f, 0x02, 0x2b, 0x85, 0x92, 0x43, 0xc6,
+ 0x47, 0x75, 0x2e, 0x94, 0xa8, 0x59, 0x35, 0x2b, 0x8a, 0x4d, 0x4d, 0x2d, 0xec, 0xd1, 0x36, 0xe5,
+ 0xc1, 0x77, 0xf1, 0x32, 0xad, 0x3f, 0xbf, 0xb2, 0x20, 0x1a, 0xc9, 0x90, 0x4c, 0x74, 0xee, 0x0a,
+ 0x10, 0x9e, 0x0c, 0xa1, 0xe4, 0xdf, 0xe9, 0xd5, 0xa1, 0x00, 0xb8, 0x42, 0xf1, 0xc2, 0x2f, 0x0d
+};
+
+/*
+Case #8: Sample tunnel-mode ESP packet
+ (ping -p ff -s 40 192.168.123.200)
+Key: 01234567 89abcdef 01234567 89abcdef
+SPI: 8765
+Source address: 192.168.123.3
+Destination address: 192.168.123.200
+Sequence number: 5
+IV: 85d47224 b5f3dd5d 2101d4ea 8dffab22
+
+Original packet:
+IP header (20 bytes): 45000044 090c0000 4001f990 c0a87b03 c0a87bc8
+Data (48 bytes):
+0800d63c aa0a0200 c69c083d a3de0300 ffffffff ffffffff ffffffff ffffffff
+ffffffff ffffffff ffffffff ffffffff
+
+Augment data with:
+Padding: 01020304 05060708 090a
+Pad length: 0a
+Next header: 04 (IP-in-IP)
+
+Pre-encryption Data with original IP header, padding, pad length and
+ next header (80 bytes):
+45000044 090c0000 4001f990 c0a87b03 c0a87bc8 0800d63c aa0a0200 c69c083d
+a3de0300 ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff
+ffffffff 01020304 05060708 090a0a04
+
+Post-encryption packet with SPI, Sequence number, IV:
+IP header: 4500007c 090d0000 4032f926 c0a87b03 c0a87bc8
+SPI/Seq #: 00008765 00000005
+IV: 85d47224 b5f3dd5d 2101d4ea 8dffab22
+Encrypted Data (80 bytes):
+15b92683 819596a8 047232cc 00f7048f e45318e1 1f8a0f62 ede3c3fc 61203bb5
+0f980a08 c9843fd3 a1b06d5c 07ff9639 b7eb7dfb 3512e5de 435e7207 ed971ef3
+d2726d9b 5ef6affc 6d17a0de cbb13892
+ */
+static unsigned char K11[] = {
+ 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef
+};
+static unsigned char IV11[] = {
+ 0x85, 0xd4, 0x72, 0x24, 0xb5, 0xf3, 0xdd, 0x5d, 0x21, 0x01, 0xd4, 0xea, 0x8d, 0xff, 0xab, 0x22
+};
+static unsigned char P11[] = {
+ 0x45, 0x00, 0x00, 0x44, 0x09, 0x0c, 0x00, 0x00, 0x40, 0x01, 0xf9, 0x90, 0xc0, 0xa8, 0x7b, 0x03,
+ 0xc0, 0xa8, 0x7b, 0xc8, 0x08, 0x00, 0xd6, 0x3c, 0xaa, 0x0a, 0x02, 0x00, 0xc6, 0x9c, 0x08, 0x3d,
+ 0xa3, 0xde, 0x03, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0a, 0x04
+};
+static unsigned char C11[] = {
+ 0x15, 0xb9, 0x26, 0x83, 0x81, 0x95, 0x96, 0xa8, 0x04, 0x72, 0x32, 0xcc, 0x00, 0xf7, 0x04, 0x8f,
+ 0xe4, 0x53, 0x18, 0xe1, 0x1f, 0x8a, 0x0f, 0x62, 0xed, 0xe3, 0xc3, 0xfc, 0x61, 0x20, 0x3b, 0xb5,
+ 0x0f, 0x98, 0x0a, 0x08, 0xc9, 0x84, 0x3f, 0xd3, 0xa1, 0xb0, 0x6d, 0x5c, 0x07, 0xff, 0x96, 0x39,
+ 0xb7, 0xeb, 0x7d, 0xfb, 0x35, 0x12, 0xe5, 0xde, 0x43, 0x5e, 0x72, 0x07, 0xed, 0x97, 0x1e, 0xf3,
+ 0xd2, 0x72, 0x6d, 0x9b, 0x5e, 0xf6, 0xaf, 0xfc, 0x6d, 0x17, 0xa0, 0xde, 0xcb, 0xb1, 0x38, 0x92
+};
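Cases #7 and #8 above both extend the original packet with an ESP-style trailer (incrementing pad bytes 01, 02, ..., a pad-length byte, then a next-header byte of 0x04 for IP-in-IP) so that the CBC input becomes a multiple of the 16-byte block size. A minimal sketch of that padding step, using illustrative names that are not part of this file:

#include <stdio.h>
#include <stddef.h>

/* Build the ESP trailer described in the vector comments above: incrementing
 * pad bytes, then the pad-length byte, then the next-header byte. */
static size_t esp_cbc_pad(unsigned char *buf, size_t len, unsigned char next_hdr)
{
	size_t pad = (16 - ((len + 2) % 16)) % 16;
	size_t i;

	for (i = 0; i < pad; i++)
		buf[len + i] = (unsigned char)(i + 1);
	buf[len + pad] = (unsigned char)pad;	/* pad length */
	buf[len + pad + 1] = next_hdr;	/* next header (0x04 = IP-in-IP) */
	return len + pad + 2;
}

int main(void)
{
	unsigned char buf[128] = { 0 };
	/* Case #7: 20-byte IP header + 64 bytes of data = 84 bytes in, 96 bytes out */
	size_t out = esp_cbc_pad(buf, 84, 0x04);

	printf("padded length = %zu, pad length byte = 0x%02x\n", out, buf[out - 2]);
	return 0;
}

Running the same helper on Case #8 (68 bytes in) yields 80 bytes with the same 0x0a pad-length byte, matching the padded data shown above.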
+
+
+#define min_size(a, b) (((a)<(b))?(a):(b))
+// Plain and cypher text will be the same size
+// Those vectors using strings for plain text have an extra null terminator that needs
+// to be ignored
+#define vect_size(P, C) (min_size((sizeof(P)),(sizeof(C))))
+#define CBC_KEY_LEN(kdata) (sizeof(kdata))
+
+//field order {K, Klen, IV, Plen, P, C};
+#define vector(N) {K##N, (CBC_KEY_LEN(K##N)), IV##N, vect_size(P##N,C##N), P##N, C##N, NULL, NULL, /*NULL, NULL*/}
+struct cbc_vector const cbc_vectors[] = {
+ vector(1),
+ vector(2),
+ vector(3),
+ vector(4),
+ vector(5),
+ vector(6),
+ vector(7),
+ vector(8),
+ vector(9),
+ vector(10),
+ vector(11),
+};
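As the comment on vect_size() notes, taking the smaller of the two sizeof() values keeps a string-style plaintext vector from counting its trailing NUL. A small standalone illustration of that behaviour (not part of the test suite; the macros are copied from above):

#include <stdio.h>

#define min_size(a, b) (((a)<(b))?(a):(b))
#define vect_size(P, C) (min_size((sizeof(P)),(sizeof(C))))

int main(void)
{
	static unsigned char C[16];	/* ciphertext: exactly one AES block */
	static char P[] = "0123456789abcdef";	/* 16 characters plus an implicit NUL */

	/* prints: sizeof(P)=17 sizeof(C)=16 vect_size=16 */
	printf("sizeof(P)=%zu sizeof(C)=%zu vect_size=%zu\n",
	       sizeof(P), sizeof(C), vect_size(P, C));
	return 0;
}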
+
+#endif /* AES_CBC_STD_VECTORS_H_ */
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_random_test.c b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_random_test.c
new file mode 100644
index 000000000..aa9412c35
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_random_test.c
@@ -0,0 +1,443 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <aes_cbc.h>
+#include "types.h"
+#include "ossl_helper.h"
+#include "cbc_std_vectors.h"
+
+//#define CBC_VECTORS_VERBOSE
+//#define CBC_VECTORS_EXTRA_VERBOSE
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#ifndef RANDOMS
+# define RANDOMS 100
+#endif
+#ifndef TEST_LEN
+# define TEST_LEN (8*1024*1024)
+#endif
+#ifndef PAGE_LEN
+# define PAGE_LEN (4*1024)
+#endif
+#ifndef MAX_UNALINED
+# define MAX_UNALINED (16)
+#endif
+
+static cbc_key_size const Ksize[] = { CBC_128_BITS, CBC_192_BITS, CBC_256_BITS };
+
+typedef void (*aes_cbc_generic)(uint8_t * in,
+ uint8_t * IV,
+ uint8_t * keys, uint8_t * out, uint64_t len_bytes);
+
+int OpenSslEnc(uint8_t k_len,
+ uint8_t * key, uint8_t * in, uint8_t * iv, uint8_t * out, uint64_t len_bytes)
+{
+ if (CBC_128_BITS == k_len) {
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" OpenSSL128 ");
+#endif
+ openssl_aes_128_cbc_enc(key, (uint8_t *) iv, len_bytes, in, out);
+ } else if (CBC_192_BITS == k_len) {
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" OpenSSL192 ");
+#endif
+ openssl_aes_192_cbc_enc(key, (uint8_t *) iv, len_bytes, in, out);
+ } else if (CBC_256_BITS == k_len) {
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" OpenSSL256 ");
+ fflush(0);
+#endif
+ openssl_aes_256_cbc_enc(key, (uint8_t *) iv, len_bytes, in, out);
+ } else {
+ fprintf(stderr, "Invalid key length: %d\n", k_len);
+ return 1;
+ }
+ return 0;
+}
+
+int OpenSslDec(uint8_t k_len,
+ uint8_t * key, uint8_t * in, uint8_t * iv, uint8_t * out, uint64_t len_bytes)
+{
+ if (CBC_128_BITS == k_len) {
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" OpenSSL128 ");
+#endif
+ openssl_aes_128_cbc_dec(key, (uint8_t *) iv, len_bytes, in, out);
+ } else if (CBC_192_BITS == k_len) {
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" OpenSSL192 ");
+#endif
+ openssl_aes_192_cbc_dec(key, (uint8_t *) iv, len_bytes, in, out);
+ } else if (CBC_256_BITS == k_len) {
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" OpenSSL256 ");
+#endif
+ openssl_aes_256_cbc_dec(key, (uint8_t *) iv, len_bytes, in, out);
+ } else {
+ fprintf(stderr, "Invalid key length: %d\n", k_len);
+ return 1;
+ }
+ return 0;
+}
+
+void mk_rand_data(uint8_t * data, uint32_t size)
+{
+ int i;
+ for (i = 0; i < size; i++) {
+ *data++ = rand();
+ }
+}
+
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name)
+{
+ int mismatch;
+ int OK = 0;
+ uint64_t a;
+
+ mismatch = memcmp(test, expected, len);
+ if (!mismatch) {
+ return OK;
+
+ } else {
+ OK = 1;
+ printf(" failed %s \t\t", data_name);
+ for (a = 0; a < len; a++) {
+ if (test[a] != expected[a]) {
+ printf(" '%x' != '%x' at %lx of %lx\n",
+ test[a], expected[a], a, len);
+ break;
+ }
+ }
+ }
+ return OK;
+}
+
+int check_vector(struct cbc_vector *vector)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ int OK = 0;
+ aes_cbc_generic enc;
+ aes_cbc_generic dec;
+
+#ifdef CBC_VECTORS_VERBOSE
+ printf(" Keylen:%d PLen:%d ", (int)vector->K_LEN, (int)vector->P_LEN);
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" K:%p P:%p C:%p IV:%p expC:%p Keys:%p ", vector->K, vector->P, vector->C,
+ vector->IV, vector->EXP_C, vector->KEYS);
+#endif
+ fflush(0);
+#else
+ printf(".");
+#endif
+
+ if (CBC_128_BITS == vector->K_LEN) {
+ enc = (aes_cbc_generic) & aes_cbc_enc_128;
+ dec = (aes_cbc_generic) & aes_cbc_dec_128;
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" CBC128 ");
+#endif
+ } else if (CBC_192_BITS == vector->K_LEN) {
+ enc = (aes_cbc_generic) & aes_cbc_enc_192;
+ dec = (aes_cbc_generic) & aes_cbc_dec_192;
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" CBC192 ");
+#endif
+ } else if (CBC_256_BITS == vector->K_LEN) {
+ enc = (aes_cbc_generic) & aes_cbc_enc_256;
+ dec = (aes_cbc_generic) & aes_cbc_dec_256;
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" CBC256 ");
+#endif
+ } else {
+ printf("Invalid key length: %d\n", vector->K_LEN);
+ return 1;
+ }
+
+	// Allocate scratch buffers: a copy of the plaintext and the OpenSSL ciphertext
+ pt_test = malloc(vector->P_LEN);
+ o_ct_test = malloc(vector->P_LEN);
+ if ((pt_test == NULL) || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+
+ aes_cbc_precomp(vector->K, vector->K_LEN, vector->KEYS);
+
+#ifdef CBC_VECTORS_VERBOSE
+ fflush(0);
+#endif
+ ////
+	// ISA-L Encrypt
+ ////
+ enc(vector->P, vector->IV, vector->KEYS->enc_keys, vector->C, vector->P_LEN);
+	if (NULL != vector->EXP_C) {	// when the expected ciphertext is known, verify it matches
+ OK |=
+ check_data(vector->EXP_C, vector->C, vector->P_LEN,
+ "ISA-L expected cypher text (C)");
+ }
+ OpenSslEnc(vector->K_LEN, vector->K, vector->P, vector->IV, o_ct_test, vector->P_LEN);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->P_LEN,
+ "OpenSSL vs ISA-L cypher text (C)");
+
+ memcpy(pt_test, vector->P, vector->P_LEN);
+ memset(vector->P, 0, vector->P_LEN);
+#ifdef CBC_VECTORS_VERBOSE
+ fflush(0);
+#endif
+
+ ////
+	// ISA-L Decrypt
+ ////
+ dec(vector->C, vector->IV, vector->KEYS->dec_keys, vector->P, vector->P_LEN);
+ OK |= check_data(vector->P, pt_test, vector->P_LEN, "ISA-L decrypted plain text (P)");
+ memset(vector->P, 0, vector->P_LEN);
+ dec(o_ct_test, vector->IV, vector->KEYS->dec_keys, vector->P, vector->P_LEN);
+ OK |= check_data(vector->P, pt_test, vector->P_LEN, "ISA-L decrypted OpenSSL (P)");
+ memset(vector->P, 0, vector->P_LEN);
+ OpenSslDec(vector->K_LEN, vector->K, vector->C, vector->IV, vector->P, vector->P_LEN);
+ OK |= check_data(vector->P, pt_test, vector->P_LEN, "OpenSSL decrypted ISA-L (P)");
+#ifdef CBC_VECTORS_VERBOSE
+ if (OK)
+ printf("Failed");
+ else
+ printf("Passed");
+
+ printf("\n");
+#endif
+
+ return OK;
+}
+
+int test_std_combinations(void)
+{
+ int const vectors_cnt = sizeof(cbc_vectors) / sizeof(cbc_vectors[0]);
+ int i, ret;
+ uint8_t *iv = NULL;
+
+ printf("AES CBC standard test vectors:");
+#ifdef CBC_VECTORS_VERBOSE
+ printf("\n");
+#endif
+ ret = posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN));
+ if ((0 != ret) || (NULL == iv))
+ return 1;
+
+ for (i = 0; (i < vectors_cnt); i++) {
+ struct cbc_vector vect = cbc_vectors[i];
+
+ ret = posix_memalign((void **)&vect.KEYS, 16, (sizeof(*vect.KEYS)));
+ if ((0 != ret) || (NULL == vect.KEYS))
+ return 1;
+		// IV data must be aligned to a 16 byte boundary, so copy it into an aligned buffer and point at that instead
+ memcpy(iv, vect.IV, CBC_IV_DATA_LEN);
+ vect.IV = iv;
+		vect.C = malloc(vect.P_LEN);
+ if ((NULL == vect.C))
+ return 1;
+#ifdef CBC_VECTORS_VERBOSE
+ printf("vector[%d of %d] ", i, vectors_cnt);
+#endif
+ if (0 == (i % 25))
+ printf("\n");
+ if (0 == (i % 10))
+ fflush(0);
+
+ if (0 != check_vector(&vect))
+ return 1;
+
+ aligned_free(vect.KEYS);
+ free(vect.C);
+ }
+
+ aligned_free(iv);
+ printf("\n");
+ return 0;
+}
+
+int test_random_combinations(void)
+{
+ struct cbc_vector test;
+ int t, ret;
+
+ printf("AES CBC random test vectors:");
+#ifdef CBC_VECTORS_VERBOSE
+ fflush(0);
+#endif
+ test.IV = NULL;
+ ret = posix_memalign((void **)&test.IV, 16, (CBC_IV_DATA_LEN));
+ if ((0 != ret) || (NULL == test.IV))
+ return 1;
+ test.KEYS = NULL;
+ ret = posix_memalign((void **)&test.KEYS, 16, (sizeof(*test.KEYS)));
+ if ((0 != ret) || (NULL == test.KEYS))
+ return 1;
+
+ for (t = 0; RANDOMS > t; t++) {
+		int Plen = 16 + ((rand() % TEST_LEN) & ~0xf);	// must be a 16-byte multiple
+ int offset = (rand() % MAX_UNALINED);
+ int Kindex = (rand() % (sizeof(Ksize) / sizeof(Ksize[0]))); // select one of the valid key sizes
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+
+ test.C = NULL;
+ test.P = NULL;
+ test.K = NULL;
+ test.EXP_C = NULL;
+ test.P_LEN = Plen;
+ test.K_LEN = Ksize[Kindex];
+
+ test.P = malloc(test.P_LEN + offset);
+ test.C = malloc(test.P_LEN + offset);
+ test.K = malloc(test.K_LEN + offset);
+ if ((NULL == test.P) || (NULL == test.C) || (NULL == test.K)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return -1;
+ }
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+
+ mk_rand_data(test.P, test.P_LEN);
+ mk_rand_data(test.K, test.K_LEN);
+ mk_rand_data(test.IV, CBC_IV_DATA_LEN);
+
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" Offset:0x%x ", offset);
+#endif
+ if (0 != check_vector(&test))
+ return 1;
+
+ test.C -= offset;
+ free(test.C);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ free(test.P);
+ }
+
+ aligned_free(test.IV);
+ aligned_free(test.KEYS);
+ printf("\n");
+ return 0;
+}
+
+int test_efence_combinations(void)
+{
+ struct cbc_vector test;
+ int offset = 0;
+ int key_idx;
+ uint8_t *P = NULL, *C = NULL, *K = NULL, *IV = NULL;
+ uint8_t *key_data = NULL;
+
+ P = malloc(PAGE_LEN);
+ C = malloc(PAGE_LEN);
+ K = malloc(PAGE_LEN);
+ IV = malloc(PAGE_LEN);
+ key_data = malloc(PAGE_LEN);
+
+ if ((NULL == P) || (NULL == C) || (NULL == K) || (NULL == IV)
+ || (NULL == key_data)
+ ) {
+ printf("malloc of testsize:0x%x failed\n", PAGE_LEN);
+ return -1;
+ }
+ // place buffers to end at page boundary
+ test.P_LEN = PAGE_LEN / 2;
+ test.EXP_C = NULL;
+
+ printf("AES CBC efence test vectors:");
+ for (key_idx = 0; key_idx < (sizeof(Ksize) / sizeof(Ksize[0])); key_idx++) {
+ test.K_LEN = Ksize[key_idx];
+
+ for (offset = 0; MAX_UNALINED > offset; offset++) {
+ if (0 == (offset % 80))
+ printf("\n");
+ // move the start and size of the data block towards the end of the page
+			test.P_LEN = ((PAGE_LEN / (1 + (2 * offset))) & ~0xff);	// round down to a multiple of 256, which keeps it a 16 byte multiple
+ if (16 > test.P_LEN)
+ test.P_LEN = 16;
+ //Place data at end of page
+ test.P = P + PAGE_LEN - test.P_LEN - offset;
+ test.C = C + PAGE_LEN - test.P_LEN - offset;
+ test.K = K + PAGE_LEN - test.K_LEN - offset;
+ test.IV = IV + PAGE_LEN - CBC_IV_DATA_LEN - offset;
+			test.IV = test.IV - ((uint64_t) test.IV & 0xff);	// round down to a 256 byte boundary (keeps 16 byte alignment)
+ test.KEYS = (struct cbc_key_data *)
+ (key_data + PAGE_LEN - sizeof(*test.KEYS) - offset);
+ test.KEYS = (struct cbc_key_data *)
+			    ((uint8_t *) test.KEYS - ((uint64_t) test.KEYS & 0xff));	// round down to a 256 byte boundary (keeps 16 byte alignment)
+
+ mk_rand_data(test.P, test.P_LEN);
+ mk_rand_data(test.K, test.K_LEN);
+ mk_rand_data(test.IV, CBC_IV_DATA_LEN);
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" Offset:0x%x ", offset);
+#endif
+ if (0 != check_vector(&test))
+ return 1;
+ }
+
+ }
+
+ free(P);
+ free(C);
+ free(K);
+ free(IV);
+ free(key_data);
+ printf("\n");
+ return 0;
+}
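The loop above shrinks each data block and pushes it toward the end of its PAGE_LEN buffer; run under a tool such as Electric Fence, where every allocation ends just before an inaccessible guard page, any access past the end of the block then faults immediately. A minimal sketch of the same placement arithmetic, with illustrative names not taken from this file:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_LEN (4 * 1024)

int main(void)
{
	unsigned char *page = malloc(PAGE_LEN);
	int offset;

	if (page == NULL)
		return 1;

	for (offset = 0; offset < 4; offset++) {
		/* same rounding as the test: a multiple of 256 bytes, at least 16 */
		size_t len = (PAGE_LEN / (1 + 2 * offset)) & ~(size_t)0xff;
		unsigned char *data;

		if (len < 16)
			len = 16;
		/* the block ends exactly `offset` bytes before the end of the buffer */
		data = page + PAGE_LEN - len - offset;
		printf("offset=%d len=%zu tail gap=%td\n",
		       offset, len, (page + PAGE_LEN) - (data + len));
	}
	free(page);
	return 0;
}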
+
+int main(void)
+{
+ uint32_t OK = 0;
+
+ srand(TEST_SEED);
+ OK |= test_std_combinations();
+ OK |= test_random_combinations();
+ OK |= test_efence_combinations();
+ if (0 == OK) {
+ printf("...Pass\n");
+ } else {
+ printf("...Fail\n");
+ }
+ return OK;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c
new file mode 100644
index 000000000..0558b4254
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c
@@ -0,0 +1,183 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/*
+ * Run list of standard CBC test vectors through encode and decode checks.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <aes_cbc.h>
+#include "types.h"
+#include "cbc_std_vectors.h"
+
+typedef void (*aes_cbc_generic)(uint8_t * in, uint8_t * IV, uint8_t * keys, uint8_t * out,
+ uint64_t len_bytes);
+
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name)
+{
+ int mismatch;
+ int OK = 0;
+ uint64_t a;
+
+ mismatch = memcmp(test, expected, len);
+ if (!mismatch) {
+ return OK;
+
+ } else {
+ OK = 1;
+ printf(" failed %s \t\t", data_name);
+ for (a = 0; a < len; a++) {
+ if (test[a] != expected[a]) {
+ printf(" '%x' != '%x' at %lx of %lx\n",
+ test[a], expected[a], a, len);
+ break;
+ }
+ }
+ }
+ return OK;
+}
+
+int check_vector(struct cbc_vector *vector)
+{
+ uint8_t *pt_test = NULL;
+ int OK = 0;
+ aes_cbc_generic enc;
+ aes_cbc_generic dec;
+
+ DEBUG_PRINT((" Keylen:%d PLen:%d ", (int)vector->K_LEN, (int)vector->P_LEN));
+ DEBUG_PRINT((" K:%p P:%p C:%p IV:%p expC:%p Keys:%p ", vector->K, vector->P, vector->C,
+ vector->IV, vector->EXP_C, vector->KEYS));
+ printf(".");
+
+ switch (vector->K_LEN) {
+ case CBC_128_BITS:
+ enc = (aes_cbc_generic) & aes_cbc_enc_128;
+ dec = (aes_cbc_generic) & aes_cbc_dec_128;
+ DEBUG_PRINT((" CBC128 "));
+ break;
+ case CBC_192_BITS:
+ enc = (aes_cbc_generic) & aes_cbc_enc_192;
+ dec = (aes_cbc_generic) & aes_cbc_dec_192;
+ DEBUG_PRINT((" CBC192 "));
+ break;
+ case CBC_256_BITS:
+ enc = (aes_cbc_generic) & aes_cbc_enc_256;
+ dec = (aes_cbc_generic) & aes_cbc_dec_256;
+ DEBUG_PRINT((" CBC256 "));
+ break;
+ default:
+ printf("Invalid key length: %d\n", vector->K_LEN);
+ return 1;
+ }
+
+	// Allocate a scratch buffer to hold a copy of the plaintext
+ pt_test = malloc(vector->P_LEN);
+
+ if (pt_test == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+
+ aes_cbc_precomp(vector->K, vector->K_LEN, vector->KEYS);
+
+ ////
+	// ISA-L Encrypt
+ ////
+ enc(vector->P, vector->IV, vector->KEYS->enc_keys, vector->C, vector->P_LEN);
+
+	if (NULL != vector->EXP_C) {	// when the expected ciphertext is known, verify it matches
+ OK |= check_data(vector->EXP_C, vector->C, vector->P_LEN,
+ "ISA-L expected cypher text (C)");
+ }
+ memcpy(pt_test, vector->P, vector->P_LEN);
+ memset(vector->P, 0, vector->P_LEN);
+
+ ////
+	// ISA-L Decrypt
+ ////
+ dec(vector->C, vector->IV, vector->KEYS->dec_keys, vector->P, vector->P_LEN);
+ OK |= check_data(vector->P, pt_test, vector->P_LEN, "ISA-L decrypted plain text (P)");
+ DEBUG_PRINT((OK ? "Failed\n" : "Passed\n"));
+
+ free(pt_test);
+ return OK;
+}
+
+int test_std_combinations(void)
+{
+ int const vectors_cnt = sizeof(cbc_vectors) / sizeof(cbc_vectors[0]);
+ int i, ret;
+ uint8_t *iv = NULL;
+
+ printf("AES CBC standard test vectors: ");
+
+ ret = posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN));
+ if ((0 != ret) || (NULL == iv))
+ return 1;
+
+ for (i = 0; (i < vectors_cnt); i++) {
+ struct cbc_vector vect = cbc_vectors[i];
+
+ ret = posix_memalign((void **)&(vect.KEYS), 16, sizeof(*vect.KEYS));
+ if ((0 != ret) || (NULL == vect.KEYS))
+ return 1;
+
+		// IV data must be aligned to a 16 byte boundary, so copy it
+		// into an aligned buffer and point at that instead
+ memcpy(iv, vect.IV, CBC_IV_DATA_LEN);
+ vect.IV = iv;
+ vect.C = malloc(vect.P_LEN);
+ if (NULL == vect.C)
+ return 1;
+
+ DEBUG_PRINT(("vector[%d of %d] ", i, vectors_cnt));
+
+ if (0 != check_vector(&vect))
+ return 1;
+
+ aligned_free(vect.KEYS);
+ free(vect.C);
+ }
+
+ aligned_free(iv);
+ return 0;
+}
+
+int main(void)
+{
+ uint32_t OK = 0;
+
+ OK = test_std_combinations();
+
+ printf(0 == OK ? "Pass\n" : "Fail\n");
+ return OK;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/clear_regs.asm b/src/crypto/isa-l/isa-l_crypto/aes/clear_regs.asm
new file mode 100644
index 000000000..2c80401e9
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/clear_regs.asm
@@ -0,0 +1,202 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifndef _CLEAR_REGS_ASM_
+%define _CLEAR_REGS_ASM_
+
+%ifndef LINUX
+%ifidn __OUTPUT_FORMAT__, elf64
+%define LINUX
+%endif
+%endif
+
+;
+; This macro clears any GP registers passed
+;
+%macro clear_gps 1-16
+%define %%NUM_REGS %0
+%rep %%NUM_REGS
+ xor %1, %1
+%rotate 1
+%endrep
+%endmacro
+
+;
+; This macro clears any XMM registers passed on SSE
+;
+%macro clear_xmms_sse 1-16
+%define %%NUM_REGS %0
+%rep %%NUM_REGS
+ pxor %1, %1
+%rotate 1
+%endrep
+%endmacro
+
+;
+; This macro clears any XMM registers passed on AVX
+;
+%macro clear_xmms_avx 1-16
+%define %%NUM_REGS %0
+%rep %%NUM_REGS
+ vpxor %1, %1
+%rotate 1
+%endrep
+%endmacro
+
+;
+; This macro clears any YMM registers passed
+;
+%macro clear_ymms 1-16
+%define %%NUM_REGS %0
+%rep %%NUM_REGS
+ vpxor %1, %1
+%rotate 1
+%endrep
+%endmacro
+
+;
+; This macro clears any ZMM registers passed
+;
+%macro clear_zmms 1-32
+%define %%NUM_REGS %0
+%rep %%NUM_REGS
+ vpxorq %1, %1
+%rotate 1
+%endrep
+%endmacro
+
+;
+; This macro clears all scratch GP registers
+; for Windows or Linux
+;
+%macro clear_scratch_gps_asm 0
+ clear_gps rax, rcx, rdx, r8, r9, r10, r11
+%ifdef LINUX
+ clear_gps rdi, rsi
+%endif
+%endmacro
+
+;
+; This macro clears all scratch XMM registers on SSE
+;
+%macro clear_scratch_xmms_sse_asm 0
+%ifdef LINUX
+%assign i 0
+%rep 16
+ pxor xmm %+ i, xmm %+ i
+%assign i (i+1)
+%endrep
+; On Windows, XMM0-XMM5 registers are scratch registers
+%else
+%assign i 0
+%rep 6
+ pxor xmm %+ i, xmm %+ i
+%assign i (i+1)
+%endrep
+%endif ; LINUX
+%endmacro
+
+;
+; This macro clears all scratch XMM registers on AVX
+;
+%macro clear_scratch_xmms_avx_asm 0
+%ifdef LINUX
+ vzeroall
+; On Windows, XMM0-XMM5 registers are scratch registers
+%else
+%assign i 0
+%rep 6
+ vpxor xmm %+ i, xmm %+ i
+%assign i (i+1)
+%endrep
+%endif ; LINUX
+%endmacro
+
+;
+; This macro clears all scratch YMM registers
+;
+; It should be called before restoring the XMM registers
+; for Windows (XMM6-XMM15)
+;
+%macro clear_scratch_ymms_asm 0
+; On Linux, all YMM registers are scratch registers
+%ifdef LINUX
+ vzeroall
+; On Windows, YMM0-YMM5 registers are scratch registers.
+; The upper 128 bits of YMM6-YMM15 are scratch registers too, but the
+; lower 128 bits are restored before these functions return, which
+; also clears the upper bits.
+%else
+%assign i 0
+%rep 6
+ vpxor ymm %+ i, ymm %+ i
+%assign i (i+1)
+%endrep
+%endif ; LINUX
+%endmacro
+
+;
+; This macro clears all scratch ZMM registers
+;
+; It should be called before restoring the XMM registers
+; for Windows (XMM6-XMM15). YMM registers are used
+; on purpose, since XOR'ing YMM registers is faster
+; than XOR'ing ZMM registers, and the operation also
+; clears the upper 256 bits.
+;
+%macro clear_scratch_zmms_asm 0
+; On Linux, all ZMM registers are scratch registers
+%ifdef LINUX
+ vzeroall
+ ;; vzeroall only clears the first 16 ZMM registers
+%assign i 16
+%rep 16
+ vpxorq ymm %+ i, ymm %+ i
+%assign i (i+1)
+%endrep
+; On Windows, ZMM0-ZMM5 and ZMM16-ZMM31 registers are scratch registers.
+; The upper 384 bits of ZMM6-ZMM15 are scratch registers too, but the
+; lower 128 bits are restored before these functions return, which
+; also clears the upper bits.
+%else
+%assign i 0
+%rep 6
+ vpxorq ymm %+ i, ymm %+ i
+%assign i (i+1)
+%endrep
+
+%assign i 16
+%rep 16
+ vpxorq ymm %+ i, ymm %+ i
+%assign i (i+1)
+%endrep
+%endif ; LINUX
+%endmacro
+
+%endif ;; _CLEAR_REGS_ASM_
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm
new file mode 100644
index 000000000..98304c552
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM128_MODE 1
+%include "gcm_avx_gen2.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2_nt.asm
new file mode 100644
index 000000000..5ee5e7b48
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2_nt.asm
@@ -0,0 +1,33 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM128_MODE 1
+%define NT_LDST
+%define FUNCT_EXTENSION _nt
+%include "gcm_avx_gen2.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm
new file mode 100644
index 000000000..902c17237
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM128_MODE 1
+%include "gcm_avx_gen4.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4_nt.asm
new file mode 100644
index 000000000..1e55d24cf
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4_nt.asm
@@ -0,0 +1,33 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM128_MODE 1
+%define NT_LDST
+%define FUNCT_EXTENSION _nt
+%include "gcm_avx_gen4.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm
new file mode 100644
index 000000000..1717a8662
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM128_MODE 1
+%include "gcm_sse.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse_nt.asm
new file mode 100644
index 000000000..d17402bea
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse_nt.asm
@@ -0,0 +1,33 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM128_MODE 1
+%define NT_LDST
+%define FUNCT_EXTENSION _nt
+%include "gcm_sse.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512.asm
new file mode 100644
index 000000000..71f284789
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512.asm
@@ -0,0 +1,32 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018-2019, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM128_MODE 1
+;; single buffer implementation
+%include "gcm_vaes_avx512.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512_nt.asm
new file mode 100644
index 000000000..c0c587133
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512_nt.asm
@@ -0,0 +1,33 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018-2019, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM128_MODE 1
+%define NT_LDST
+%define FUNCT_EXTENSION _nt
+%include "gcm_vaes_avx512.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm
new file mode 100644
index 000000000..4b159cefb
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM256_MODE 1
+%include "gcm_avx_gen2.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2_nt.asm
new file mode 100644
index 000000000..822ef07cc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2_nt.asm
@@ -0,0 +1,33 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM256_MODE 1
+%define NT_LDST
+%define FUNCT_EXTENSION _nt
+%include "gcm_avx_gen2.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm
new file mode 100644
index 000000000..f6050a8ff
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM256_MODE 1
+%include "gcm_avx_gen4.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4_nt.asm
new file mode 100644
index 000000000..5959d698f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4_nt.asm
@@ -0,0 +1,33 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM256_MODE 1
+%define NT_LDST
+%define FUNCT_EXTENSION _nt
+%include "gcm_avx_gen4.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm
new file mode 100644
index 000000000..c583d02b8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM256_MODE 1
+%include "gcm_sse.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse_nt.asm
new file mode 100644
index 000000000..5952a6005
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse_nt.asm
@@ -0,0 +1,33 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM256_MODE 1
+%define NT_LDST
+%define FUNCT_EXTENSION _nt
+%include "gcm_sse.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512.asm
new file mode 100644
index 000000000..bd318fcd1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512.asm
@@ -0,0 +1,32 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018-2019, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM256_MODE 1
+;; single buffer implementation
+%include "gcm_vaes_avx512.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512_nt.asm
new file mode 100644
index 000000000..da2f611b4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512_nt.asm
@@ -0,0 +1,33 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018-2019, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM256_MODE 1
+%define NT_LDST
+%define FUNCT_EXTENSION _nt
+%include "gcm_vaes_avx512.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen2.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen2.asm
new file mode 100644
index 000000000..90db18910
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen2.asm
@@ -0,0 +1,2130 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in paper:
+;       Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+;
+; For the shift-based reductions used in this code, we used the method described in paper:
+; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[3] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+;       from the definition of the spec, aadLen must be a multiple of 4 bytes.
+;       The code additionally supports aadLen of any length.
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; throughout the code, one-tab and two-tab indentations are used: one tab is for the GHASH part, two tabs are for the AES part.
+;
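+; For the 12-byte (salt + sequence number) IV layout above, the initial counter
+; block is simply the IV followed by a big-endian 32-bit block counter that
+; starts at 1. A rough C model of that step (variable names hypothetical,
+; assuming a 12-byte IV):
+;
+;     uint8_t j0[16];
+;     memcpy(j0, iv, 12);                 /* salt + per-packet IV              */
+;     j0[12] = 0; j0[13] = 0;
+;     j0[14] = 0; j0[15] = 1;             /* 32-bit block counter, starts at 1 */
+;
+; GCM_INIT below stores this block as OrigIV and a byte-reflected copy as CurCount.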
+
+%include "reg_sizes.asm"
+%include "gcm_defines.asm"
+
+%ifndef GCM128_MODE
+%ifndef GCM192_MODE
+%ifndef GCM256_MODE
+%error "No GCM mode selected for gcm_avx_gen2.asm!"
+%endif
+%endif
+%endif
+
+%ifndef FUNCT_EXTENSION
+%define FUNCT_EXTENSION
+%endif
+
+%ifdef GCM128_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx_gen2 %+ FUNCT_EXTENSION
+%define NROUNDS 9
+%endif
+
+%ifdef GCM192_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx_gen2 %+ FUNCT_EXTENSION
+%define NROUNDS 11
+%endif
+
+%ifdef GCM256_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx_gen2 %+ FUNCT_EXTENSION
+%define NROUNDS 13
+%endif
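+
+; For example, with GCM256_MODE defined and FUNCT_EXTENSION empty,
+; FN_NAME(enc,_update_) expands to aes_gcm_enc_256_update_avx_gen2; when a
+; *_nt wrapper defines FUNCT_EXTENSION as _nt (as the gcm256_*_nt.asm files
+; above do), _nt is appended to each of these symbol names.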
+
+default rel
+; need to push 5 registers onto the stack; STACK_OFFSET accounts for them when locating stack arguments
+%define STACK_OFFSET 8*5
+
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+
+%define LOCAL_STORAGE 16*7
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected)
+; Output: C = A*B*x mod poly, (i.e. >>1 )
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes
+%define %%HK %2 ; 16 Bytes
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba
+ vpshufd %%T2, %%GH, 01001110b
+ vpshufd %%T3, %%HK, 01001110b
+ vpxor %%T2, %%T2, %%GH ; %%T2 = (a1+a0)
+ vpxor %%T3, %%T3, %%HK ; %%T3 = (b1+b0)
+
+ vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0
+ vpclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ vpxor %%T2, %%T2, %%GH
+ vpxor %%T2, %%T2, %%T1 ; %%T2 = a0*b1+a1*b0
+
+ vpslldq %%T3, %%T2, 8 ; shift-L %%T3 2 DWs
+ vpsrldq %%T2, %%T2, 8 ; shift-R %%T2 2 DWs
+ vpxor %%GH, %%GH, %%T3
+ vpxor %%T1, %%T1, %%T2 ; <%%T1:%%GH> = %%GH x %%HK
+
+ ;first phase of the reduction
+        vpslld  %%T2, %%GH, 31                                 ; packed left shifting << 31
+        vpslld  %%T3, %%GH, 30                                 ; packed left shifting << 30
+        vpslld  %%T4, %%GH, 25                                 ; packed left shifting << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T5, %%T2, 4 ; shift-R %%T5 1 DW
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+
+        vpsrld  %%T2,%%GH,1                                    ; packed right shifting >> 1
+        vpsrld  %%T3,%%GH,2                                    ; packed right shifting >> 2
+        vpsrld  %%T4,%%GH,7                                    ; packed right shifting >> 7
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpxor %%T2, %%T2, %%T5
+ vpxor %%GH, %%GH, %%T2
+ vpxor %%GH, %%GH, %%T1 ; the result is in %%GH
+
+
+%endmacro
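+
+; The macro above computes this product with PCLMULQDQ and one Karatsuba step
+; instead of 128 conditional shift/xor iterations. Ignoring the bit-reflected
+; data layout used here, the underlying GF(2^128) GHASH multiply can be modelled
+; in C with the right-shift method of NIST SP 800-38D (function name hypothetical):
+;
+;     #include <stdint.h>
+;     #include <string.h>
+;
+;     static void ghash_mul_ref(uint8_t z[16], const uint8_t x[16], const uint8_t h[16])
+;     {
+;         uint8_t v[16], acc[16] = {0};
+;         memcpy(v, h, 16);
+;         for (int i = 0; i < 128; i++) {
+;             if ((x[i / 8] >> (7 - (i % 8))) & 1)        /* bit i of x, MSB first */
+;                 for (int j = 0; j < 16; j++) acc[j] ^= v[j];
+;             int lsb = v[15] & 1;                        /* bit that falls off    */
+;             for (int j = 15; j > 0; j--)                /* v >>= 1 (128-bit)     */
+;                 v[j] = (uint8_t)((v[j] >> 1) | (v[j - 1] << 7));
+;             v[0] >>= 1;
+;             if (lsb) v[0] ^= 0xe1;                      /* reduce mod the GCM poly */
+;         }
+;         memcpy(z, acc, 16);
+;     }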
+
+
+%macro PRECOMPUTE 8
+%define %%GDATA %1
+%define %%HK %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+%define %%T6 %8
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
+ vmovdqa %%T5, %%HK
+
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_2_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_3], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_3_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_4], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_4_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_5], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_5_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_6], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_6_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_7], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_7_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_8], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_8_k], %%T1
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 6
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2
+%define %%LENGTH %3
+%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
+%define %%COUNTER %5
+%define %%TMP1 %6
+
+ vpxor %%OUTPUT, %%OUTPUT
+ mov %%COUNTER, %%LENGTH
+ mov %%END_READ_LOCATION, %%INPUT
+ add %%END_READ_LOCATION, %%LENGTH
+ xor %%TMP1, %%TMP1
+
+
+ cmp %%COUNTER, 8
+ jl %%_byte_loop_2
+        vpinsrq %%OUTPUT, [%%INPUT],0                  ;Read in 8 bytes if they exist
+ je %%_done
+
+ sub %%COUNTER, 8
+
+%%_byte_loop_1: ;Read in data 1 byte at a time while data is left
+ shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_1
+ vpinsrq %%OUTPUT, %%TMP1, 1
+ jmp %%_done
+
+%%_byte_loop_2: ;Read in data 1 byte at a time while data is left
+ cmp %%COUNTER, 0
+ je %%_done
+ shl %%TMP1, 8 ;This loop handles when no bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_2
+ vpinsrq %%OUTPUT, %%TMP1, 0
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
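+
+; Behaviourally this is a zero-padded partial load that never reads past the end
+; of the input buffer; a C sketch of the intended result (the assembly builds it
+; a byte at a time precisely to avoid over-reading) would be:
+;
+;     uint8_t block[16] = {0};
+;     memcpy(block, input, length);       /* length < 16, no over-read */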
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 14
+%define %%A_IN %1
+%define %%A_LEN %2
+%define %%AAD_HASH %3
+%define %%HASH_KEY %4
+%define %%XTMP1 %5 ; xmm temp reg 5
+%define %%XTMP2 %6
+%define %%XTMP3 %7
+%define %%XTMP4 %8
+%define %%XTMP5 %9 ; xmm temp reg 5
+%define %%T1 %10 ; temp reg 1
+%define %%T2 %11
+%define %%T3 %12
+%define %%T4 %13
+%define %%T5 %14 ; temp reg 5
+
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ vpxor %%AAD_HASH, %%AAD_HASH
+
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block
+
+%%_get_AAD_loop16:
+
+ vmovdqu %%XTMP1, [%%T1]
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+ sub %%T2, 16
+ je %%_CALC_AAD_done
+
+ add %%T1, 16
+ cmp %%T2, 16
+ jge %%_get_AAD_loop16
+
+%%_get_small_AAD_block:
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
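+
+; In effect: hash = 0, then for every 16-byte AAD block B, hash = (hash ^ B) * H,
+; with a final short block zero-padded via READ_SMALL_DATA_INPUT. A rough C model
+; (reusing the hypothetical ghash_mul_ref sketched above):
+;
+;     uint8_t hash[16] = {0};
+;     while (aad_len >= 16) {
+;         for (int i = 0; i < 16; i++) hash[i] ^= aad[i];
+;         ghash_mul_ref(hash, hash, h);
+;         aad += 16; aad_len -= 16;
+;     }
+;     if (aad_len) { /* zero-pad the tail, then repeat the xor and multiply */ }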
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag update for partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input:
+; GDATA_KEY - struct gcm_key_data *
+; GDATA_CTX - struct gcm_context_data *
+; PLAIN_CYPH_IN - input text
+; PLAIN_CYPH_LEN - input text length
+; DATA_OFFSET - the current data offset
+; ENC_DEC - whether encoding or decoding
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 8
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%DATA_OFFSET %6
+%define %%AAD_HASH %7
+%define %%ENC_DEC %8
+ mov r13, [%%GDATA_CTX + PBlockLen]
+ cmp r13, 0
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+        VXLDR   xmm1, [%%PLAIN_CYPH_IN]                 ;If at least 16 bytes of data, just fill the xmm register
+ jmp %%_data_read
+
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
+
+%%_data_read: ;Finished reading in data
+
+
+ vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+
+ lea r12, [SHIFT_MASK]
+
+ cmp r13, rax
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ vmovdqa xmm3, xmm1
+ vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+        jge     %%_no_extra_mask_1              ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_1:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpand xmm3, xmm1
+ vpshufb xmm3, [SHUF_MASK]
+ vpshufb xmm3, xmm2
+ vpxor %%AAD_HASH, xmm3
+
+
+ cmp r15,0
+ jl %%_partial_incomplete_1
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA_CTX + PBlockLen], rax
+ jmp %%_dec_done
+%%_partial_incomplete_1:
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%%_dec_done:
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+%else
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+        jge     %%_no_extra_mask_2              ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_2:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpshufb xmm9, [SHUF_MASK]
+ vpshufb xmm9, xmm2
+ vpxor %%AAD_HASH, xmm9
+
+ cmp r15,0
+ jl %%_partial_incomplete_2
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA_CTX + PBlockLen], rax
+ jmp %%_encode_done
+%%_partial_incomplete_2:
+ add [%%GDATA_CTX+PBlockLen], %%PLAIN_CYPH_LEN
+%%_encode_done:
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ vpshufb xmm9, xmm2
+%endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ cmp r15,0
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill:
+ mov r13, %%PLAIN_CYPH_LEN
+%%_count_set:
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply GHASH on the ciphertext
+; %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as pointers only, not modified.
+; Updated AAD_HASH is returned in %%T3
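+; e.g. for a = 100 plaintext bytes: b = 6 full blocks, so %%num_initial_blocks = 6;
+; those 6 blocks are handled here, and the by-8 main loop then only ever sees a
+; multiple of 8 full blocks (the 4 trailing bytes go through the partial-block path).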
+
+%macro INITIAL_BLOCKS 24
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%LENGTH %5
+%define %%DATA_OFFSET %6
+%define %%num_initial_blocks %7 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %8
+%define %%HASH_KEY %9
+%define %%T3 %10
+%define %%T4 %11
+%define %%T5 %12
+%define %%CTR %13
+%define %%XMM1 %14
+%define %%XMM2 %15
+%define %%XMM3 %16
+%define %%XMM4 %17
+%define %%XMM5 %18
+%define %%XMM6 %19
+%define %%XMM7 %20
+%define %%XMM8 %21
+%define %%T6 %22
+%define %%T_key %23
+%define %%ENC_DEC %24
+
+%assign i (8-%%num_initial_blocks)
+ vmovdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg
+ ; start AES for %%num_initial_blocks blocks
+ vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0
+
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa reg(i), %%CTR
+ vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpxor reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep NROUNDS
+ vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep ; NROUNDS
+
+
+vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1
+ %endif
+ vpshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations
+%assign i (i+1)
+%endrep
+
+
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+
+%rep %%num_initial_blocks
+ vpxor reg(j), reg(i)
+ GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+ ; %%XMM8 has the current Hash Value
+ vmovdqa %%T3, %%XMM8
+
+ cmp %%LENGTH, 128
+ jl %%_initial_blocks_done ; no need for precomputed constants
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; prepare and encrypt 8 counter blocks for the next 8 blocks of data
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM1, %%CTR
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM2, %%CTR
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM3, %%CTR
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM4, %%CTR
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM5, %%CTR
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM6, %%CTR
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM7, %%CTR
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM8, %%CTR
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*0]
+ vpxor %%XMM1, %%T_key
+ vpxor %%XMM2, %%T_key
+ vpxor %%XMM3, %%T_key
+ vpxor %%XMM4, %%T_key
+ vpxor %%XMM5, %%T_key
+ vpxor %%XMM6, %%T_key
+ vpxor %%XMM7, %%T_key
+ vpxor %%XMM8, %%T_key
+
+
+%assign i 1
+%rep NROUNDS
+ vmovdqu %%T_key, [%%GDATA_KEY+16*i]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+%assign i (i+1)
+%endrep
+
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*i]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ vpxor %%XMM1, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM1, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ vpxor %%XMM2, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM2, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ vpxor %%XMM3, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM3, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ vpxor %%XMM4, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM4, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ vpxor %%XMM5, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM5, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ vpxor %%XMM6, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM6, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ vpxor %%XMM7, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM7, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ vpxor %%XMM8, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM8, %%T1
+ %endif
+
+ add %%DATA_OFFSET, 128
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+
+; encrypt 8 blocks at a time
+; ghash the 8 previously encrypted ciphertext blocks
+; %%GDATA - (GCM key data), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; r11 is the data offset value
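+; Note: the hash lags the cipher by one iteration - at entry the previous 8
+; ciphertext blocks are saved (XMM1 in %%T2, XMM2-XMM8 in the TMP2-TMP8 stack
+; slots) and their GHASH (PCLMULQDQ/Karatsuba) work is interleaved with the AES
+; rounds of the next 8 counter blocks, so the two instruction streams overlap.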
+%macro GHASH_8_ENCRYPT_8_PARALLEL 22
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5
+%define %%T2 %6
+%define %%T3 %7
+%define %%T4 %8
+%define %%T5 %9
+%define %%T6 %10
+%define %%CTR %11
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20
+%define %%loop_idx %21
+%define %%ENC_DEC %22
+
+ vmovdqa %%T2, %%XMM1
+ vmovdqu [rsp + TMP2], %%XMM2
+ vmovdqu [rsp + TMP3], %%XMM3
+ vmovdqu [rsp + TMP4], %%XMM4
+ vmovdqu [rsp + TMP5], %%XMM5
+ vmovdqu [rsp + TMP6], %%XMM6
+ vmovdqu [rsp + TMP7], %%XMM7
+ vmovdqu [rsp + TMP8], %%XMM8
+
+%ifidn %%loop_idx, in_order
+ vpaddd %%XMM1, %%CTR, [ONE] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONE]
+ vpaddd %%XMM3, %%XMM2, [ONE]
+ vpaddd %%XMM4, %%XMM3, [ONE]
+ vpaddd %%XMM5, %%XMM4, [ONE]
+ vpaddd %%XMM6, %%XMM5, [ONE]
+ vpaddd %%XMM7, %%XMM6, [ONE]
+ vpaddd %%XMM8, %%XMM7, [ONE]
+ vmovdqa %%CTR, %%XMM8
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+%else
+ vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONEf]
+ vpaddd %%XMM3, %%XMM2, [ONEf]
+ vpaddd %%XMM4, %%XMM3, [ONEf]
+ vpaddd %%XMM5, %%XMM4, [ONEf]
+ vpaddd %%XMM6, %%XMM5, [ONEf]
+ vpaddd %%XMM7, %%XMM6, [ONEf]
+ vpaddd %%XMM8, %%XMM7, [ONEf]
+ vmovdqa %%CTR, %%XMM8
+%endif
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T1, [%%GDATA + 16*0]
+ vpxor %%XMM1, %%T1
+ vpxor %%XMM2, %%T1
+ vpxor %%XMM3, %%T1
+ vpxor %%XMM4, %%T1
+ vpxor %%XMM5, %%T1
+ vpxor %%XMM6, %%T1
+ vpxor %%XMM7, %%T1
+ vpxor %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T1, [%%GDATA + 16*1]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [%%GDATA + 16*2]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0
+
+ vpshufd %%T6, %%T2, 01001110b
+ vpxor %%T6, %%T2
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8_k]
+ vpclmulqdq %%T6, %%T6, %%T5, 0x00 ;
+
+
+ vmovdqu %%T1, [%%GDATA + 16*3]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP2]
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_7_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*4]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu %%T1, [rsp + TMP3]
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_6_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*5]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [rsp + TMP4]
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_5_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*6]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP5]
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_4_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+
+ vmovdqu %%T1, [%%GDATA + 16*7]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP6]
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_3_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*8]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP7]
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_2_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + 16*9]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T1, [rsp + TMP8]
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vpxor %%T6, %%T4
+ vpxor %%T6, %%T7
+
+%ifdef GCM128_MODE
+ vmovdqu %%T5, [%%GDATA + 16*10]
+%endif
+%ifdef GCM192_MODE
+ vmovdqu %%T5, [%%GDATA + 16*10]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*11]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*12]
+%endif
+%ifdef GCM256_MODE
+ vmovdqu %%T5, [%%GDATA + 16*10]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*11]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*12]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*13]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*14]
+%endif
+
+%assign i 0
+%assign j 1
+%rep 8
+
+%ifidn %%ENC_DEC, ENC
+%ifdef NT_LD
+ VXLDR %%T2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i]
+ vpxor %%T2, %%T2, %%T5
+%else
+ vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i]
+%endif ; NT_LD
+ vaesenclast reg(j), reg(j), %%T2
+%else
+ VXLDR %%T2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i]
+ vpxor %%T2, %%T2, %%T5
+ vaesenclast %%T3, reg(j), %%T2
+ vpxor reg(j), %%T2, %%T5
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*i], %%T3
+%endif ; %%ENC_DEC
+
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+
+ vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs
+        vpsrldq %%T6, %%T6, 8                   ; shift-R %%T6 2 DWs
+ vpxor %%T7, %%T3
+ vpxor %%T6, %%T4 ; accumulate the results in %%T6:%%T7
+
+
+ ;first phase of the reduction
+
+        vpslld  %%T2, %%T7, 31                                  ; packed left shifting << 31
+        vpslld  %%T3, %%T7, 30                                  ; packed left shifting << 30
+        vpslld  %%T4, %%T7, 25                                  ; packed left shifting << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ %ifidn %%ENC_DEC, ENC
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 ; Write to the Ciphertext buffer
+ %endif
+
+ ;second phase of the reduction
+
+        vpsrld  %%T2,%%T7,1                                     ; packed right shifting >> 1
+        vpsrld  %%T3,%%T7,2                                     ; packed right shifting >> 2
+        vpsrld  %%T4,%%T7,7                                     ; packed right shifting >> 7
+ vpxor %%T2, %%T2,%%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2,%%T4
+
+ vpxor %%T2, %%T2, %%T1
+ vpxor %%T7, %%T7, %%T2
+ vpxor %%T6, %%T6, %%T7 ; the result is in %%T6
+
+
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK]
+ vpshufb %%XMM3, [SHUF_MASK]
+ vpshufb %%XMM4, [SHUF_MASK]
+ vpshufb %%XMM5, [SHUF_MASK]
+ vpshufb %%XMM6, [SHUF_MASK]
+ vpshufb %%XMM7, [SHUF_MASK]
+ vpshufb %%XMM8, [SHUF_MASK]
+
+
+ vpxor %%XMM1, %%T6
+
+%endmacro
+
+
+; GHASH the last 8 ciphertext blocks.
+; %%GDATA is GCM key data
+%macro GHASH_LAST_8 16
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+ ;; Karatsuba Method
+
+
+ vpshufd %%T2, %%XMM1, 01001110b
+ vpxor %%T2, %%XMM1
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
+ vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
+
+ vmovdqu %%T3, [%%GDATA + HashKey_8_k]
+ vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpshufd %%T2, %%XMM2, 01001110b
+ vpxor %%T2, %%XMM2
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_7_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpshufd %%T2, %%XMM3, 01001110b
+ vpxor %%T2, %%XMM3
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_6_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpshufd %%T2, %%XMM4, 01001110b
+ vpxor %%T2, %%XMM4
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_5_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM5, 01001110b
+ vpxor %%T2, %%XMM5
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_4_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM6, 01001110b
+ vpxor %%T2, %%XMM6
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_3_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM7, 01001110b
+ vpxor %%T2, %%XMM7
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_2_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM8, 01001110b
+ vpxor %%T2, %%XMM8
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+ vpxor %%XMM1, %%XMM1, %%T6
+ vpxor %%T2, %%XMM1, %%T7
+
+
+
+
+ vpslldq %%T4, %%T2, 8
+ vpsrldq %%T2, %%T2, 8
+
+ vpxor %%T7, %%T4
+ vpxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+ ;first phase of the reduction
+
+        vpslld  %%T2, %%T7, 31                                  ; packed left shifting << 31
+        vpslld  %%T3, %%T7, 30                                  ; packed left shifting << 30
+        vpslld  %%T4, %%T7, 25                                  ; packed left shifting << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+
+        vpsrld  %%T2,%%T7,1                                     ; packed right shifting >> 1
+        vpsrld  %%T3,%%T7,2                                     ; packed right shifting >> 2
+        vpsrld  %%T4,%%T7,7                                     ; packed right shifting >> 7
+ vpxor %%T2, %%T2,%%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2,%%T4
+
+ vpxor %%T2, %%T2, %%T1
+ vpxor %%T7, %%T7, %%T2
+ vpxor %%T6, %%T6, %%T7 ; the result is in %%T6
+
+
+%endmacro
+
+
+; Encryption of a single block
+; %%GDATA is GCM key data
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%GDATA %1
+%define %%XMM0 %2
+
+ vpxor %%XMM0, [%%GDATA+16*0]
+%assign i 1
+%rep NROUNDS
+ vaesenc %%XMM0, [%%GDATA+16*i]
+%assign i (i+1)
+%endrep ; NROUNDS
+ vaesenclast %%XMM0, [%%GDATA+16*i]
+%endmacro
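+
+; With NROUNDS = 9/11/13 this performs the initial key whitening, 9/11/13 vaesenc
+; rounds and a final vaesenclast, i.e. the usual 10/12/14 AES rounds for
+; 128/192/256-bit keys.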
+
+
+;; Start of Stack Setup
+
+%macro FUNC_SAVE 0
+ ;; Required for Update/GMC_ENC
+        ;the number of pushes times 8 must equal STACK_OFFSET
+ push r12
+ push r13
+ push r14
+ push r15
+ push rsi
+ mov r14, rsp
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+ vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
+ vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
+ vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
+ vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
+ vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
+ vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
+ vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
+ vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
+ vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
+
+ mov arg5, arg(5) ;[r14 + STACK_OFFSET + 8*5]
+%endif
+%endmacro
+
+
+%macro FUNC_RESTORE 0
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16]
+ vmovdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16]
+ vmovdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16]
+ vmovdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16]
+ vmovdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16]
+ vmovdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16]
+ vmovdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16]
+ vmovdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16]
+ vmovdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16]
+ vmovdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16]
+%endif
+
+;; Required for Update/GMC_ENC
+ mov rsp, r14
+ pop rsi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
+; Input: struct gcm_key_data *(GDATA_KEY), struct gcm_context_data *(GDATA_CTX),
+; IV, Additional Authentication Data (A_IN), Additional
+; Authentication Data length (A_LEN)
+; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and the other parts of GDATA_CTX initialized.
+; Clobbers rax, r10-r13, and xmm0-xmm6
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 5
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%IV %3
+%define %%A_IN %4
+%define %%A_LEN %5
+%define %%AAD_HASH xmm0
+%define %%SUBHASH xmm1
+
+
+ vmovdqu %%SUBHASH, [%%GDATA_KEY + HashKey]
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
+ vpxor xmm2, xmm3
+ mov r10, %%A_LEN
+
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash
+ mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length
+ xor r10, r10
+ mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0
+ mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = 0
+ mov r10, %%IV
+ vmovdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001
+ vpinsrq xmm2, [r10], 0
+ vpinsrd xmm2, [r10+8], 2
+ vmovdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv
+
+ vpshufb xmm2, [SHUF_MASK]
+
+ vmovdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv
+%endmacro
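+
+; In C terms (field names as used in the comments above, 12-byte IV case),
+; GCM_INIT leaves the context as:
+;
+;     ctx->aad_hash             = GHASH(aad);          /* CALC_AAD_HASH */
+;     ctx->aad_length           = aad_len;
+;     ctx->in_length            = 0;
+;     ctx->partial_block_length = 0;
+;     ctx->orig_IV              = iv || 0x00000001;
+;     ctx->current_counter      = byte-reflected copy of ctx->orig_IV;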
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
+; has been initialized by GCM_INIT
+; Requires the input data be at least 1 byte long because of READ_SMALL_DATA_INPUT.
+; Input: struct gcm_key_data* (GDATA_KEY), struct gcm_context_data * (GDATA_CTX),
+; input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 6
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%ENC_DEC %6
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16-byte blocks in the message
+; process (number of 16-byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16-byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+; if there is a block of less than 16 bytes, process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
+ cmp %%PLAIN_CYPH_LEN, 0
+ je %%_multiple_of_16_bytes
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+ add [%%GDATA_CTX+InLen], %%PLAIN_CYPH_LEN ; Update length of data processed
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey] ; xmm13 = HashKey
+ vmovdqu xmm8, [%%GDATA_CTX + AadHash]
+
+
+ PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+
+
+ mov r13, %%PLAIN_CYPH_LEN
+ sub r13, %%DATA_OFFSET
+ mov r10, r13 ; save the amount of data left to process in r10
+ and r13, -16 ; r13 = r13 - (r13 mod 16)
+
+ mov r12, r13
+ shr r12, 4
+ and r12, 7
+
+ jz %%_initial_num_blocks_is_0
+
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*7
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*6
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*5
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*4
+ jmp %%_initial_blocks_encrypted
+
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*3
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*2
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ cmp r13, 0
+ je %%_zero_cipher_left
+
+ sub r13, 128
+ je %%_eight_cipher_left
+
+
+
+
+ vmovd r15d, xmm9
+ and r15d, 255
+ vpshufb xmm9, [SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ cmp r15d, 255-8
+ jg %%_encrypt_by_8
+
+
+
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+ jmp %%_eight_cipher_left
+
+%%_encrypt_by_8:
+ vpshufb xmm9, [SHUF_MASK]
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN,%%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
+ vpshufb xmm9, [SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+
+
+
+
+%%_eight_cipher_left:
+ GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+
+%%_zero_cipher_left:
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14 ; ctx_data.aad hash = xmm14
+ vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; ctx_data.current_counter = xmm9
+
+ mov r13, r10
+ and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16)
+
+ je %%_multiple_of_16_bytes
+
+        mov     [%%GDATA_CTX + PBlockLen], r13          ; ctx_data.partial_block_length = r13
+        ; handle the last <16 Byte block separately
+
+ vpaddd xmm9, [ONE] ; INCR CNT to get Yn
+ vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
+ vpshufb xmm9, [SHUF_MASK]
+ ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Yn)
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm9 ; ctx_data.partial_block_enc_key = xmm9
+
+ cmp %%PLAIN_CYPH_LEN, 16
+ jge %%_large_enough_update
+
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13
+ jmp %%_data_read
+
+%%_large_enough_update:
+ sub %%DATA_OFFSET, 16
+ add %%DATA_OFFSET, r13
+
+ vmovdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block
+
+ sub %%DATA_OFFSET, r13
+ add %%DATA_OFFSET, 16
+
+
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
+
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm1, xmm2 ; shift right 16-r13 bytes
+%%_data_read:
+%ifidn %%ENC_DEC, DEC
+ vmovdqa xmm2, xmm1
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpand xmm2, xmm1
+ vpshufb xmm2, [SHUF_MASK]
+ vpxor xmm14, xmm2
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+%else
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpshufb xmm9, [SHUF_MASK]
+ vpxor xmm14, xmm9
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output r13 Bytes
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_multiple_of_16_bytes:
+
+
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of the last partial block after GCM_UPDATE finishes.
+; Input: struct gcm_key_data* (GDATA_KEY), struct gcm_context_data *(GDATA_CTX) and
+; whether encoding or decoding (ENC_DEC).
+; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN)
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 5
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%AUTH_TAG %3
+%define %%AUTH_TAG_LEN %4
+%define %%ENC_DEC %5
+%define %%PLAIN_CYPH_LEN rax
+
+ mov r12, [%%GDATA_CTX + PBlockLen]
+ vmovdqu xmm14, [%%GDATA_CTX + AadHash]
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+%%_partial_done:
+
+ mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
+
+ shl r12, 3 ; convert into number of bits
+ vmovd xmm15, r12d ; len(A) in xmm15
+
+        shl     %%PLAIN_CYPH_LEN, 3                     ; len(C) in bits (*8)
+ vmovq xmm1, %%PLAIN_CYPH_LEN
+ vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ vpxor xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ vpxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
+ vpshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0
+
+ ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0)
+
+ vpxor xmm9, xmm14
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+%%_T_8:
+ vmovq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ vmovq rax, xmm9
+ mov [r10], rax
+ vpsrldq xmm9, xmm9, 8
+ vmovd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+
+%%_T_16:
+ vmovdqu [r10], xmm9
+
+%%_return_T_done:
+%endmacro ; GCM_COMPLETE
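+
+; Note (summary of the tag math above, per the GCM definition in NIST SP 800-38D):
+;   S = GHASH_H( A || pad || C || pad || [len(A)]_64 || [len(C)]_64 )
+;   T = MSB_t( E(K, Y0) XOR S )
+; GCM_COMPLETE folds the length block into the running GHASH (xmm14), encrypts the
+; saved initial counter block Y0 (OrigIV), XORs the two, and truncates the result
+; to the requested 8, 12 or 16 byte tag.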
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_precomp_128_avx_gen2
+; (struct gcm_key_data *key_data);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(precomp,_)
+FN_NAME(precomp,_):
+ endbranch
+
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp
+
+
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; only xmm6 needs to be maintained
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+ vpxor xmm6, xmm6
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
+
+ vpshufb xmm6, [SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+ vmovdqa xmm2, xmm6
+ vpsllq xmm6, 1
+ vpsrlq xmm2, 63
+ vmovdqa xmm1, xmm2
+ vpslldq xmm2, xmm2, 8
+ vpsrldq xmm1, xmm1, 8
+ vpor xmm6, xmm2
+ ;reduction
+ vpshufd xmm2, xmm1, 00100100b
+ vpcmpeqd xmm2, [TWOONE]
+ vpand xmm2, [POLY]
+ vpxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+ mov rsp, r14
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+%endif ; _nt
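+
+; Note: the precomp routine above derives the GHASH key as H = E(K, 0^128)
+; (an encryption of the all-zero block), stores HashKey<<1 mod poly, and then
+; has PRECOMPUTE fill the table of higher powers of H used by the multi-block
+; GHASH loops.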
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_init_128_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(init,_)
+FN_NAME(init,_):
+ endbranch
+
+ push r12
+ push r13
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ push arg5
+ sub rsp, 1*16
+ vmovdqu [rsp + 0*16],xmm6
+ mov arg5, [rsp + 1*16 + 8*3 + 8*5]
+%endif
+
+ GCM_INIT arg1, arg2, arg3, arg4, arg5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 1*16
+ pop arg5
+%endif
+ pop r13
+ pop r12
+ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_update_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(enc,_update_)
+FN_NAME(enc,_update_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_update_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(dec,_update_)
+FN_NAME(dec,_update_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_finalize_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(enc,_finalize_)
+FN_NAME(enc,_finalize_):
+ endbranch
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm9
+ vmovdqu [rsp + 2*16],xmm11
+ vmovdqu [rsp + 3*16],xmm14
+ vmovdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, arg4, ENC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + 4*16]
+ vmovdqu xmm14 , [rsp + 3*16]
+ vmovdqu xmm11 , [rsp + 2*16]
+ vmovdqu xmm9 , [rsp + 1*16]
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_finalize_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(dec,_finalize_)
+FN_NAME(dec,_finalize_):
+ endbranch
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm9
+ vmovdqu [rsp + 2*16],xmm11
+ vmovdqu [rsp + 3*16],xmm14
+ vmovdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, arg4, DEC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + 4*16]
+ vmovdqu xmm14 , [rsp + 3*16]
+ vmovdqu xmm11 , [rsp + 2*16]
+ vmovdqu xmm9 , [rsp + 1*16]
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(enc,_)
+FN_NAME(enc,_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC
+
+ GCM_COMPLETE arg1, arg2, arg9, arg10, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(dec,_)
+FN_NAME(dec,_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC
+
+ GCM_COMPLETE arg1, arg2, arg9, arg10, DEC
+
+ FUNC_RESTORE
+
+ ret
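+
+; Example call sequence for the functions above (a minimal sketch in C, assuming
+; these gen2 symbols are declared to the caller and that the AES round keys have
+; already been expanded into key_data - key expansion is not shown here):
+;
+;   struct gcm_key_data key_data;         /* round keys + hash key table  */
+;   struct gcm_context_data ctx;          /* per-message state            */
+;
+;   aes_gcm_precomp_128_avx_gen2(&key_data);                        /* derive H and its powers  */
+;   aes_gcm_init_128_avx_gen2(&key_data, &ctx, iv, aad, aad_len);   /* per-message setup        */
+;   aes_gcm_enc_128_update_avx_gen2(&key_data, &ctx, out, in, len); /* may be called repeatedly */
+;   aes_gcm_enc_128_finalize_avx_gen2(&key_data, &ctx, tag, 16);    /* tag length: 8, 12 or 16  */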
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen4.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen4.asm
new file mode 100644
index 000000000..4a0b4f82e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen4.asm
@@ -0,0 +1,3277 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in the paper:
+; Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+; The details of the implementation are explained in:
+; Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16-byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[3] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+; From the definition of the spec, aadLen must be a multiple of 4 bytes.
+; The code additionally supports any aadLen length.
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; Throughout the code, one-tab and two-tab indentations are used: one tab is for the GHASH part, two tabs are for the AES part.
+;
+
+%include "reg_sizes.asm"
+%include "gcm_defines.asm"
+
+%ifndef GCM128_MODE
+%ifndef GCM192_MODE
+%ifndef GCM256_MODE
+%error "No GCM mode selected for gcm_avx_gen4.asm!"
+%endif
+%endif
+%endif
+
+%ifndef FUNCT_EXTENSION
+%define FUNCT_EXTENSION
+%endif
+
+;; Decide on AES-GCM key size to compile for
+%ifdef GCM128_MODE
+%define NROUNDS 9
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx_gen4 %+ FUNCT_EXTENSION
+%endif
+
+%ifdef GCM192_MODE
+%define NROUNDS 11
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx_gen4 %+ FUNCT_EXTENSION
+%endif
+
+%ifdef GCM256_MODE
+%define NROUNDS 13
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx_gen4 %+ FUNCT_EXTENSION
+%endif
+
+section .text
+default rel
+
+; need to push 5 registers into stack to maintain
+%define STACK_OFFSET 8*5
+
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+
+%define LOCAL_STORAGE 16*7
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected)
+; Output: C = A*B*x mod poly, (i.e. >>1 )
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes
+%define %%HK %2 ; 16 Bytes
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%T2, %%GH, %%HK, 0x00 ; %%T2 = a0*b0
+ vpclmulqdq %%T3, %%GH, %%HK, 0x01 ; %%T3 = a1*b0
+ vpclmulqdq %%GH, %%GH, %%HK, 0x10 ; %%GH = a0*b1
+ vpxor %%GH, %%GH, %%T3
+
+
+ vpsrldq %%T3, %%GH, 8 ; shift-R %%GH 2 DWs
+ vpslldq %%GH, %%GH, 8 ; shift-L %%GH 2 DWs
+
+ vpxor %%T1, %%T1, %%T3
+ vpxor %%GH, %%GH, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%GH, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs
+
+ vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%GH, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%GH, %%T3, %%GH, 0x10
+ vpslldq %%GH, %%GH, 4 ; shift-L %%GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%GH, %%GH, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%GH, %%GH, %%T1 ; the result is in %%GH
+
+%endmacro
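+
+; Note: the 256-bit carry-less product is reduced modulo
+;   g(x) = x^128 + x^127 + x^126 + x^121 + 1
+; in two phases using the POLY2 constant; because both operands are kept
+; bit-reflected and the key is pre-shifted (HashKey<<1 mod poly), the result is
+; again the bit-reflected GH*HashKey mod g(x), as stated in the header above.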
+
+
+; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx_gen4
+; functions, but are kept to allow users to switch cpu architectures between calls
+; of pre, init, update, and finalize.
+%macro PRECOMPUTE 8
+%define %%GDATA %1
+%define %%HK %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+%define %%T6 %8
+
+        ; Hashkey_i_k holds XORed values of the low and high parts of the Hashkey_i
+ vmovdqa %%T5, %%HK
+
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_2_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_3], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_3_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_4], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_4_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_5], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_5_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_6], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_6_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_7], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_7_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_8], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_8_k], %%T1
+%endmacro
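+
+; After PRECOMPUTE, the key structure holds HashKey_i = HashKey^i<<1 mod poly for
+; i = 1..8 (consumed by the 8-block parallel GHASH), plus the HashKey_i_k values
+; (the XOR of the high and low halves of each HashKey_i), which are kept only for
+; compatibility with the other code paths, as noted above.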
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 6
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2
+%define %%LENGTH %3
+%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
+%define %%COUNTER %5
+%define %%TMP1 %6
+
+ vpxor %%OUTPUT, %%OUTPUT
+ mov %%COUNTER, %%LENGTH
+ mov %%END_READ_LOCATION, %%INPUT
+ add %%END_READ_LOCATION, %%LENGTH
+ xor %%TMP1, %%TMP1
+
+
+ cmp %%COUNTER, 8
+ jl %%_byte_loop_2
+        vpinsrq %%OUTPUT, [%%INPUT],0           ;Read in 8 bytes if they exist
+ je %%_done
+
+ sub %%COUNTER, 8
+
+%%_byte_loop_1: ;Read in data 1 byte at a time while data is left
+ shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_1
+ vpinsrq %%OUTPUT, %%TMP1, 1
+ jmp %%_done
+
+%%_byte_loop_2: ;Read in data 1 byte at a time while data is left
+        ;; NOTE: in the current implementation the check for zero length is not needed here;
+        ;; the adequate checks are done by the callers of this macro.
+ ;; cmp %%COUNTER, 0
+ ;; je %%_done
+ shl %%TMP1, 8 ;This loop handles when no bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_2
+ vpinsrq %%OUTPUT, %%TMP1, 0
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
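+
+; Worked example: for LENGTH = 11 the macro first loads the low 8 bytes with
+; vpinsrq, then walks the remaining 3 bytes backwards from INPUT+11 into a
+; temporary register and inserts them as the upper quadword, leaving the unread
+; bytes of that lane zeroed.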
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 14
+%define %%A_IN %1
+%define %%A_LEN %2
+%define %%AAD_HASH %3
+%define %%HASH_KEY %4
+%define %%XTMP1 %5 ; xmm temp reg 5
+%define %%XTMP2 %6
+%define %%XTMP3 %7
+%define %%XTMP4 %8
+%define %%XTMP5 %9 ; xmm temp reg 5
+%define %%T1 %10 ; temp reg 1
+%define %%T2 %11
+%define %%T3 %12
+%define %%T4 %13
+%define %%T5 %14 ; temp reg 5
+
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ vpxor %%AAD_HASH, %%AAD_HASH
+
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block
+
+%%_get_AAD_loop16:
+
+ vmovdqu %%XTMP1, [%%T1]
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+ sub %%T2, 16
+ je %%_CALC_AAD_done
+
+ add %%T1, 16
+ cmp %%T2, 16
+ jge %%_get_AAD_loop16
+
+%%_get_small_AAD_block:
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
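+
+; Each 16-byte AAD block is byte-reflected and folded into the running hash as
+;   AAD_HASH = (AAD_HASH XOR block) * HashKey  mod poly
+; A trailing block shorter than 16 bytes is read with READ_SMALL_DATA_INPUT,
+; which zero-pads it before the same fold, matching the zero-padding described
+; in the AAD format comments at the top of this file.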
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag computation for partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
+; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET),
+; and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 8
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%DATA_OFFSET %6
+%define %%AAD_HASH %7
+%define %%ENC_DEC %8
+
+ mov r13, [%%GDATA_CTX + PBlockLen]
+ cmp r13, 0
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+        VXLDR   xmm1, [%%PLAIN_CYPH_IN]                 ;If at least 16 bytes of data, just fill the xmm register
+ jmp %%_data_read
+
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
+
+%%_data_read: ;Finished reading in data
+
+
+ vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+
+ lea r12, [SHIFT_MASK]
+
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ vmovdqa xmm3, xmm1
+ vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+        jge     %%_no_extra_mask_1              ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_1:
+
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK]; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpand xmm3, xmm1
+ vpshufb xmm3, [SHUF_MASK]
+ vpshufb xmm3, xmm2
+ vpxor %%AAD_HASH, xmm3
+
+
+ cmp r15,0
+ jl %%_partial_incomplete_1
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA_CTX + PBlockLen], rax
+ jmp %%_dec_done
+%%_partial_incomplete_1:
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%%_dec_done:
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+%else
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+        jge     %%_no_extra_mask_2              ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_2:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpshufb xmm9, [SHUF_MASK]
+ vpshufb xmm9, xmm2
+ vpxor %%AAD_HASH, xmm9
+
+ cmp r15,0
+ jl %%_partial_incomplete_2
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA_CTX + PBlockLen], rax
+ jmp %%_encode_done
+%%_partial_incomplete_2:
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%%_encode_done:
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ vpshufb xmm9, xmm2
+%endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ cmp r15,0
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill:
+ mov r13, %%PLAIN_CYPH_LEN
+%%_count_set:
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
+
+%macro GHASH_SINGLE_MUL 9
+%define %%GDATA %1
+%define %%HASHKEY %2
+%define %%CIPHER %3
+%define %%STATE_11 %4
+%define %%STATE_00 %5
+%define %%STATE_MID %6
+%define %%T1 %7
+%define %%T2 %8
+%define %%FIRST %9
+
+ vmovdqu %%T1, [%%GDATA + %%HASHKEY]
+%ifidn %%FIRST, first
+ vpclmulqdq %%STATE_11, %%CIPHER, %%T1, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%STATE_00, %%CIPHER, %%T1, 0x00 ; %%T4_2 = a0*b0
+ vpclmulqdq %%STATE_MID, %%CIPHER, %%T1, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 ; %%T5 = a0*b1
+ vpxor %%STATE_MID, %%STATE_MID, %%T2
+%else
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x11
+ vpxor %%STATE_11, %%STATE_11, %%T2
+
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x00
+ vpxor %%STATE_00, %%STATE_00, %%T2
+
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x01
+ vpxor %%STATE_MID, %%STATE_MID, %%T2
+
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10
+ vpxor %%STATE_MID, %%STATE_MID, %%T2
+%endif
+
+%endmacro
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as pointers only, not modified.
+; Updated AAD_HASH is returned in %%T3
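+; Worked example: for a = 200 plaintext bytes, b = floor(200/16) = 12 and
+; %%num_initial_blocks = 12 mod 8 = 4, so 4 blocks are handled here and the
+; remaining full blocks go through the 8-block parallel loop.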
+
+%macro INITIAL_BLOCKS 23
+%define %%GDATA_KEY %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%LENGTH %4
+%define %%DATA_OFFSET %5
+%define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %7
+%define %%T2 %8
+%define %%T3 %9
+%define %%T4 %10
+%define %%T5 %11
+%define %%CTR %12
+%define %%XMM1 %13
+%define %%XMM2 %14
+%define %%XMM3 %15
+%define %%XMM4 %16
+%define %%XMM5 %17
+%define %%XMM6 %18
+%define %%XMM7 %19
+%define %%XMM8 %20
+%define %%T6 %21
+%define %%T_key %22
+%define %%ENC_DEC %23
+
+%assign i (8-%%num_initial_blocks)
+ ;; Move AAD_HASH to temp reg
+ vmovdqu %%T2, %%XMM8
+ ;; Start AES for %%num_initial_blocks blocks
+ ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa reg(i), %%CTR
+ vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+%if(%%num_initial_blocks>0)
+vmovdqu %%T_key, [%%GDATA_KEY+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpxor reg(i),reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep NROUNDS
+vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%endif ; %if(%%num_initial_blocks>0)
+
+
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), reg(i), %%T1
+ ;; Write back ciphertext for %%num_initial_blocks blocks
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1
+ %endif
+ ;; Prepare ciphertext for GHASH computations
+ vpshufb reg(i), [SHUF_MASK]
+%assign i (i+1)
+%endrep
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%assign i (9-%%num_initial_blocks)
+%if(%%num_initial_blocks>0)
+ vmovdqa %%T3, reg(i)
+%assign i (i+1)
+
+%rep %%num_initial_blocks-1
+ vmovdqu [rsp + TMP %+ i], reg(i)
+%assign i (i+1)
+%endrep
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+                ;; Hashkey_i_k holds XORed values of the low and high parts of
+                ;; the Hashkey_i
+ vpaddd %%XMM1, %%CTR, [ONE] ; INCR Y0
+ vpaddd %%XMM2, %%CTR, [TWO] ; INCR Y0
+ vpaddd %%XMM3, %%XMM1, [TWO] ; INCR Y0
+ vpaddd %%XMM4, %%XMM2, [TWO] ; INCR Y0
+ vpaddd %%XMM5, %%XMM3, [TWO] ; INCR Y0
+ vpaddd %%XMM6, %%XMM4, [TWO] ; INCR Y0
+ vpaddd %%XMM7, %%XMM5, [TWO] ; INCR Y0
+ vpaddd %%XMM8, %%XMM6, [TWO] ; INCR Y0
+ vmovdqa %%CTR, %%XMM8
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*0]
+ vpxor %%XMM1, %%XMM1, %%T_key
+ vpxor %%XMM2, %%XMM2, %%T_key
+ vpxor %%XMM3, %%XMM3, %%T_key
+ vpxor %%XMM4, %%XMM4, %%T_key
+ vpxor %%XMM5, %%XMM5, %%T_key
+ vpxor %%XMM6, %%XMM6, %%T_key
+ vpxor %%XMM7, %%XMM7, %%T_key
+ vpxor %%XMM8, %%XMM8, %%T_key
+
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+%assign k (%%num_initial_blocks)
+
+%define %%T4_2 %%T4
+%if(%%num_initial_blocks>0)
+ ;; Hash in AES state
+ ;; T2 - incoming AAD hash
+ vpxor %%T2, %%T3
+
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*1]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*2]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>1)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*3]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*4]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>2)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>3)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*5]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*6]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>4)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*7]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*8]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>5)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*9]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%ifndef GCM128_MODE
+ vmovdqu %%T_key, [%%GDATA_KEY+16*10]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>6)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+%ifdef GCM128_MODE
+ vmovdqu %%T_key, [%%GDATA_KEY+16*10]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+%endif
+
+%ifdef GCM192_MODE
+ vmovdqu %%T_key, [%%GDATA_KEY+16*11]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*12]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+%endif
+%ifdef GCM256_MODE
+ vmovdqu %%T_key, [%%GDATA_KEY+16*11]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*12]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>7)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+%ifdef GCM256_MODE ; GCM256
+ vmovdqu %%T_key, [%%GDATA_KEY+16*13]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*14]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+%endif ; GCM256 mode
+
+%if(%%num_initial_blocks>0)
+ vpsrldq %%T3, %%T6, 8 ; shift-R %%T2 2 DWs
+ vpslldq %%T6, %%T6, 8 ; shift-L %%T3 2 DWs
+ vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4
+ vpxor %%T4, %%T6, %%T4
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; First phase of the reduction
+ vmovdqu %%T3, [POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T4, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
+
+ ;; First phase of the reduction complete
+ vpxor %%T4, %%T4, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; Second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T4, 0x00
+ ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+ vpsrldq %%T2, %%T2, 4
+
+ vpclmulqdq %%T4, %%T3, %%T4, 0x10
+ ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
+ vpslldq %%T4, %%T4, 4
+ ;; Second phase of the reduction complete
+ vpxor %%T4, %%T4, %%T2
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; The result is in %%T3
+ vpxor %%T3, %%T1, %%T4
+%else
+ ;; The hash should end up in T3
+ vmovdqa %%T3, %%T2
+%endif
+
+ ;; Final hash is now in T3
+%if %%num_initial_blocks > 0
+ ;; NOTE: obsolete in case %%num_initial_blocks = 0
+ sub %%LENGTH, 16*%%num_initial_blocks
+%endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ vpxor %%XMM1, %%XMM1, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM1, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ vpxor %%XMM2, %%XMM2, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM2, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ vpxor %%XMM3, %%XMM3, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM3, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ vpxor %%XMM4, %%XMM4, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM4, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ vpxor %%XMM5, %%XMM5, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM5, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ vpxor %%XMM6, %%XMM6, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM6, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ vpxor %%XMM7, %%XMM7, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM7, %%T1
+ %endif
+
+%if %%num_initial_blocks > 0
+ ;; NOTE: 'jl' is never taken for %%num_initial_blocks = 0
+        ;; This macro is executed for length 128 and up,
+ ;; zero length is checked in GCM_ENC_DEC.
+ ;; If the last block is partial then the xor will be done later
+ ;; in ENCRYPT_FINAL_PARTIAL_BLOCK.
+ ;; We know it's partial if LENGTH - 16*num_initial_blocks < 128
+ cmp %%LENGTH, 128
+ jl %%_initial_skip_last_word_write
+%endif
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ vpxor %%XMM8, %%XMM8, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM8, %%T1
+ %endif
+
+ ;; Update %%LENGTH with the number of blocks processed
+ sub %%LENGTH, 16
+ add %%DATA_OFFSET, 16
+%%_initial_skip_last_word_write:
+ sub %%LENGTH, 128-16
+ add %%DATA_OFFSET, 128-16
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ ;; Combine GHASHed value with the corresponding ciphertext
+ vpxor %%XMM1, %%XMM1, %%T3
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+;;; INITIAL_BLOCKS macro with support for a partial final block.
+;;; num_initial_blocks is expected to include the partial final block
+;;; in the count.
+%macro INITIAL_BLOCKS_PARTIAL 25
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%LENGTH %5
+%define %%DATA_OFFSET %6
+%define %%num_initial_blocks %7 ; can be 1, 2, 3, 4, 5, 6 or 7 (not 0)
+%define %%T1 %8
+%define %%T2 %9
+%define %%T3 %10
+%define %%T4 %11
+%define %%T5 %12
+%define %%CTR %13
+%define %%XMM1 %14
+%define %%XMM2 %15
+%define %%XMM3 %16
+%define %%XMM4 %17
+%define %%XMM5 %18
+%define %%XMM6 %19
+%define %%XMM7 %20
+%define %%XMM8 %21
+%define %%T6 %22
+%define %%T_key %23
+%define %%ENC_DEC %24
+%define %%INSTANCE_TYPE %25
+
+%assign i (8-%%num_initial_blocks)
+ ;; Move AAD_HASH to temp reg
+ vmovdqu %%T2, %%XMM8
+ ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ ;; Compute AES counters
+ vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0
+ vmovdqa reg(i), %%CTR
+ vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+vmovdqu %%T_key, [%%GDATA_KEY+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ ; Start AES for %%num_initial_blocks blocks
+ vpxor reg(i),reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep NROUNDS
+vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Hash all but the last block of data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%assign i (9-%%num_initial_blocks)
+%if %%num_initial_blocks > 0
+%rep %%num_initial_blocks-1
+ ;; Encrypt the message for all but the last block
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), reg(i), %%T1
+ ;; write back ciphertext for %%num_initial_blocks blocks
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1
+ %endif
+ ;; Prepare ciphertext for GHASH computations
+ vpshufb reg(i), [rel SHUF_MASK]
+%assign i (i+1)
+%endrep
+%endif
+ ;; The final block of data may be <16B
+ sub %%LENGTH, 16*(%%num_initial_blocks-1)
+
+%if %%num_initial_blocks < 8
+ ;; NOTE: the 'jl' is always taken for num_initial_blocks = 8.
+ ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 128.
+ cmp %%LENGTH, 16
+ jl %%_small_initial_partial_block
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Handle a full length final block - encrypt and hash all blocks
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ sub %%LENGTH, 16
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH
+
+ ;; Encrypt the message
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), reg(i), %%T1
+ ;; write back ciphertext for %%num_initial_blocks blocks
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1
+ %endif
+ ;; Prepare ciphertext for GHASH computations
+ vpshufb reg(i), [rel SHUF_MASK]
+
+ ;; Hash all of the data
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+%assign k (%%num_initial_blocks)
+%assign last_block_to_hash 0
+
+%if(%%num_initial_blocks>last_block_to_hash)
+ ;; Hash in AES state
+ vpxor %%T2, reg(j)
+
+ ;; T2 - incoming AAD hash
+ ;; reg(i) holds ciphertext
+ ;; T5 - hash key
+ ;; T6 - updated xor
+ ;; reg(1)/xmm1 should now be available for tmp use
+ vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
+ vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0
+ vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
+ vpxor %%T6, %%T6, %%T5
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%assign rep_count (%%num_initial_blocks-1)
+%if rep_count > 0
+%rep rep_count
+
+ vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
+ vpclmulqdq %%T3, reg(j), %%T5, 0x11
+ vpxor %%T1, %%T1, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x00
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%endrep
+%endif
+ ;; Record that a reduction is needed
+ mov r12, 1
+
+ jmp %%_small_initial_compute_hash
+
+
+%endif ; %if %%num_initial_blocks < 8
+
+%%_small_initial_partial_block:
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Handle ghash for a <16B final block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;; In this case if it's a single call to encrypt we can
+ ;; hash all of the data but if it's an init / update / finalize
+        ;; series of calls we need to leave the last block if it's
+ ;; less than a full block of data.
+
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], reg(i)
+ ;; Handle a partial final block
+ ;; GDATA, KEY, T1, T2
+ ;; r13 - length
+ ;; LT16 - indicates type of read and that the buffer is less than 16 bytes long
+ ;; NOTE: could be replaced with %%LENGTH but at this point
+ ;; %%LENGTH is always less than 16.
+ ;; No PLAIN_CYPH_LEN argument available in this macro.
+ ENCRYPT_FINAL_PARTIAL_BLOCK reg(i), %%T1, %%T3, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, LT16, %%ENC_DEC, %%DATA_OFFSET
+ vpshufb reg(i), [SHUF_MASK]
+
+%ifidn %%INSTANCE_TYPE, multi_call
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+%assign k (%%num_initial_blocks-1)
+%assign last_block_to_hash 1
+%else
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+%assign k (%%num_initial_blocks)
+%assign last_block_to_hash 0
+%endif
+
+%if(%%num_initial_blocks>last_block_to_hash)
+ ;; Record that a reduction is needed
+ mov r12, 1
+ ;; Hash in AES state
+ vpxor %%T2, reg(j)
+
+ ;; T2 - incoming AAD hash
+ ;; reg(i) holds ciphertext
+ ;; T5 - hash key
+ ;; T6 - updated xor
+ ;; reg(1)/xmm1 should now be available for tmp use
+ vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
+ vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0
+ vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
+ vpxor %%T6, %%T6, %%T5
+%else
+ ;; Record that a reduction is not needed -
+ ;; In this case no hashes are computed because there
+ ;; is only one initial block and it is < 16B in length.
+ mov r12, 0
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%ifidn %%INSTANCE_TYPE, multi_call
+%assign rep_count (%%num_initial_blocks-2)
+%%_multi_call_hash:
+%else
+%assign rep_count (%%num_initial_blocks-1)
+%endif
+%if rep_count > 0
+%rep rep_count
+
+ vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
+ vpclmulqdq %%T3, reg(j), %%T5, 0x11
+ vpxor %%T1, %%T1, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x00
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%endrep
+%endif
+
+%%_small_initial_compute_hash:
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Ghash reduction
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%if(%%num_initial_blocks=1)
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; We only need to check if a reduction is needed if
+ ;; initial_blocks == 1 and init/update/final is being used.
+ ;; In this case we may just have a partial block, and that
+ ;; gets hashed in finalize.
+ cmp r12, 0
+ je %%_no_reduction_needed
+%endif
+%endif
+
+ vpsrldq %%T3, %%T6, 8 ; shift-R %%T2 2 DWs
+ vpslldq %%T6, %%T6, 8 ; shift-L %%T3 2 DWs
+ vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4
+ vpxor %%T4, %%T6, %%T4
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; First phase of the reduction
+ vmovdqu %%T3, [POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T4, 0x01
+ ;; shift-L xmm2 2 DWs
+ vpslldq %%T2, %%T2, 8
+ vpxor %%T4, %%T4, %%T2
+
+ ;; First phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Second phase of the reduction
+
+ vpclmulqdq %%T2, %%T3, %%T4, 0x00
+ ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+ vpsrldq %%T2, %%T2, 4
+
+ vpclmulqdq %%T4, %%T3, %%T4, 0x10
+ ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
+ vpslldq %%T4, %%T4, 4
+
+ vpxor %%T4, %%T4, %%T2
+ ;; Second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T3, %%T1, %%T4
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; If using init/update/finalize, we need to xor any partial block data
+ ;; into the hash.
+%if %%num_initial_blocks > 1
+ ;; NOTE: for %%num_initial_blocks = 0 the xor never takes place
+%if %%num_initial_blocks != 8
+ ;; NOTE: for %%num_initial_blocks = 8, %%LENGTH, stored in [PBlockLen] is never zero
+ cmp qword [%%GDATA_CTX + PBlockLen], 0
+ je %%_no_partial_block_xor
+%endif ; %%num_initial_blocks != 8
+ vpxor %%T3, %%T3, reg(8)
+%%_no_partial_block_xor:
+%endif ; %%num_initial_blocks > 1
+%endif ; %%INSTANCE_TYPE, multi_call
+
+%if(%%num_initial_blocks=1)
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; NOTE: %%_no_reduction_needed case only valid for
+ ;; multi_call with initial_blocks = 1.
+ ;; Look for comment above around '_no_reduction_needed'
+ ;; The jmp below is obsolete as the code will fall through.
+
+ ;; The result is in %%T3
+ jmp %%_after_reduction
+
+%%_no_reduction_needed:
+ ;; The hash should end up in T3. The only way we should get here is if
+ ;; there is a partial block of data, so xor that into the hash.
+ vpxor %%T3, %%T2, reg(8)
+%endif ; %%INSTANCE_TYPE = multi_call
+%endif ; %%num_initial_blocks=1
+
+%%_after_reduction:
+ ;; Final hash is now in T3
+
+%endmacro ; INITIAL_BLOCKS_PARTIAL
+
+
+
+; encrypt 8 blocks at a time
+; ghash the 8 previously encrypted ciphertext blocks
+; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; %%DATA_OFFSET is the data offset value
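+; The AES rounds for the current 8 counter blocks are interleaved with the GHASH
+; (pclmulqdq) work on the previous 8 ciphertext blocks so that the two dependency
+; chains overlap, which is the scheduling approach described in the referenced papers.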
+%macro GHASH_8_ENCRYPT_8_PARALLEL 23
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5
+%define %%T2 %6
+%define %%T3 %7
+%define %%T4 %8
+%define %%T5 %9
+%define %%T6 %10
+%define %%CTR %11
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20
+%define %%loop_idx %21
+%define %%ENC_DEC %22
+%define %%FULL_PARTIAL %23
+
+ vmovdqa %%T2, %%XMM1
+ vmovdqu [rsp + TMP2], %%XMM2
+ vmovdqu [rsp + TMP3], %%XMM3
+ vmovdqu [rsp + TMP4], %%XMM4
+ vmovdqu [rsp + TMP5], %%XMM5
+ vmovdqu [rsp + TMP6], %%XMM6
+ vmovdqu [rsp + TMP7], %%XMM7
+ vmovdqu [rsp + TMP8], %%XMM8
+
+%ifidn %%loop_idx, in_order
+ vpaddd %%XMM1, %%CTR, [ONE] ; INCR CNT
+ vmovdqu %%T5, [TWO]
+ vpaddd %%XMM2, %%CTR, %%T5
+ vpaddd %%XMM3, %%XMM1, %%T5
+ vpaddd %%XMM4, %%XMM2, %%T5
+ vpaddd %%XMM5, %%XMM3, %%T5
+ vpaddd %%XMM6, %%XMM4, %%T5
+ vpaddd %%XMM7, %%XMM5, %%T5
+ vpaddd %%XMM8, %%XMM6, %%T5
+ vmovdqa %%CTR, %%XMM8
+
+ vmovdqu %%T5, [SHUF_MASK]
+ vpshufb %%XMM1, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM2, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM3, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM4, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM5, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM6, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM7, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM8, %%T5 ; perform a 16Byte swap
+%else
+ vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT
+ vmovdqu %%T5, [TWOf]
+ vpaddd %%XMM2, %%CTR, %%T5
+ vpaddd %%XMM3, %%XMM1, %%T5
+ vpaddd %%XMM4, %%XMM2, %%T5
+ vpaddd %%XMM5, %%XMM3, %%T5
+ vpaddd %%XMM6, %%XMM4, %%T5
+ vpaddd %%XMM7, %%XMM5, %%T5
+ vpaddd %%XMM8, %%XMM6, %%T5
+ vmovdqa %%CTR, %%XMM8
+%endif
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T1, [%%GDATA + 16*0]
+ vpxor %%XMM1, %%XMM1, %%T1
+ vpxor %%XMM2, %%XMM2, %%T1
+ vpxor %%XMM3, %%XMM3, %%T1
+ vpxor %%XMM4, %%XMM4, %%T1
+ vpxor %%XMM5, %%XMM5, %%T1
+ vpxor %%XMM6, %%XMM6, %%T1
+ vpxor %%XMM7, %%XMM7, %%T1
+ vpxor %%XMM8, %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T1, [%%GDATA + 16*1]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [%%GDATA + 16*2]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0
+ vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
+ vpxor %%T6, %%T6, %%T5
+
+ vmovdqu %%T1, [%%GDATA + 16*3]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP2]
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*4]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu %%T1, [rsp + TMP3]
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*5]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [rsp + TMP4]
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*6]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP5]
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*7]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP6]
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*8]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP7]
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + 16*9]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T1, [rsp + TMP8]
+ vmovdqu %%T5, [%%GDATA + HashKey]
+
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T1, %%T4, %%T3
+
+
+ vmovdqu %%T5, [%%GDATA + 16*10]
+ %ifndef GCM128_MODE ; GCM192 or GCM256
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*11]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*12]
+%endif
+%ifdef GCM256_MODE
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*13]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*14]
+%endif ; GCM256
+
+%assign i 0
+%assign j 1
+%rep 8
+
+ ;; SNP TBD: This is pretty ugly - consider whether just XORing the
+ ;; data in after vaesenclast would be simpler and as performant. The change
+ ;; would also have to be rippled through the partial block and ghash_mul_8 paths.
+%ifidn %%FULL_PARTIAL, full
+ %ifdef NT_LD
+ VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ vpxor %%T2, %%T2, %%T5
+ %else
+ vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ %endif
+
+ %ifidn %%ENC_DEC, ENC
+ vaesenclast reg(j), reg(j), %%T2
+ %else
+ vaesenclast %%T3, reg(j), %%T2
+ vpxor reg(j), %%T2, %%T5
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
+ %endif
+
+%else
+ ; Don't read the final data during partial block processing
+ %ifdef NT_LD
+ %if (i<7)
+ VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ vpxor %%T2, %%T2, %%T5
+ %else
+ ;; Stage the key directly in T2 rather than hash it with plaintext
+ vmovdqu %%T2, %%T5
+ %endif
+ %else
+ %if (i<7)
+ vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ %else
+ ;; Stage the key directly in T2 rather than hash it with plaintext
+ vmovdqu %%T2, %%T5
+ %endif
+ %endif
+
+ %ifidn %%ENC_DEC, ENC
+ vaesenclast reg(j), reg(j), %%T2
+ %else
+ %if (i<7)
+ vaesenclast %%T3, reg(j), %%T2
+ vpxor reg(j), %%T2, %%T5
+ ;; Do not read the data since it could fault
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
+ %else
+ vaesenclast reg(j), reg(j), %%T2
+ %endif
+ %endif
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs
+ vpsrldq %%T6, %%T6, 8 ; shift-R %%T2 2 DWs
+ vpxor %%T7, %%T7, %%T3
+ vpxor %%T1, %%T1, %%T6 ; accumulate the results in %%T1:%%T7
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T7, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
+
+ vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ %ifidn %%ENC_DEC, ENC
+ ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7
+ %ifidn %%FULL_PARTIAL, full
+ ;; Avoid writing past the buffer if handling a partial block
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8
+ %endif
+ %endif
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T7, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%T4, %%T3, %%T7, 0x10
+ vpslldq %%T4, %%T4, 4 ; shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T1, %%T1, %%T4 ; the result is in %%T1
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+
+ vpxor %%XMM1, %%T1
+
+
+%endmacro ; GHASH_8_ENCRYPT_8_PARALLEL
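+
+;; For reference: the "first phase"/"second phase" reduction above (and in
+;; GHASH_LAST_8 / GHASH_LAST_7 below) folds the 256-bit carry-less product of
+;; the ciphertext blocks and the HashKey powers back into 128 bits, i.e. it
+;; computes <high:low> mod g(x) with g(x) = x^128 + x^7 + x^2 + x + 1, using
+;; the precomputed POLY2 constant so the whole reduction takes only three
+;; vpclmulqdq instructions. This is a descriptive note only; see the
+;; instruction sequence above for the exact register usage.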
+
+
+ ; GHASH the last 8 ciphertext blocks.
+%macro GHASH_LAST_8 16
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+
+ ;; Karatsuba Method
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+
+ vpshufd %%T2, %%XMM1, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM1
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
+ vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
+
+ vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpshufd %%T2, %%XMM2, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM2
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpshufd %%T2, %%XMM3, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM3
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpshufd %%T2, %%XMM4, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM4
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpshufd %%T2, %%XMM5, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM5
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpshufd %%T2, %%XMM6, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM6
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpshufd %%T2, %%XMM7, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM7
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpshufd %%T2, %%XMM8, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM8
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+ vpxor %%XMM1, %%XMM1, %%T6
+ vpxor %%T2, %%XMM1, %%T7
+
+
+
+
+ vpslldq %%T4, %%T2, 8
+ vpsrldq %%T2, %%T2, 8
+
+ vpxor %%T7, %%T7, %%T4
+ vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T7, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
+
+ vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T7, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%T4, %%T3, %%T7, 0x10
+ vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T6, %%T6, %%T4 ; the result is in %%T6
+%endmacro
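+
+;; Note on the "Karatsuba Method" used above: each 128x128-bit carry-less
+;; multiply of a ciphertext block X by a HashKey power H is done with three
+;; vpclmulqdq operations instead of four:
+;;     hi  = X.hi * H.hi
+;;     lo  = X.lo * H.lo
+;;     mid = (X.hi ^ X.lo) * (H.hi ^ H.lo)
+;; and the middle 128 bits are recovered as mid ^ hi ^ lo before the
+;; accumulated result is reduced. (Descriptive sketch only; the macro keeps
+;; the hi terms in %%T6, the lo terms in %%T7 and the mid terms in %%XMM1.)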
+
+
+ ; GHASH the last 7 ciphertext blocks.
+%macro GHASH_LAST_7 15
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+
+ ;; Karatsuba Method
+
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+
+ vpshufd %%T2, %%XMM1, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM1
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
+ vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
+
+ vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpshufd %%T2, %%XMM2, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM2
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpshufd %%T2, %%XMM3, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM3
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpshufd %%T2, %%XMM4, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM4
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpshufd %%T2, %%XMM5, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM5
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpshufd %%T2, %%XMM6, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM6
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_1]
+ vpshufd %%T2, %%XMM7, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM7
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpxor %%XMM1, %%XMM1, %%T6
+ vpxor %%T2, %%XMM1, %%T7
+
+
+
+
+ vpslldq %%T4, %%T2, 8
+ vpsrldq %%T2, %%T2, 8
+
+ vpxor %%T7, %%T7, %%T4
+ vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T7, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
+
+ vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T7, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%T4, %%T3, %%T7, 0x10
+ vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T6, %%T6, %%T4 ; the result is in %%T6
+%endmacro
+
+
+
+;;; Handle encryption of the final partial block
+;;; IN:
+;;; r13 - Number of bytes to read
+;;; MODIFIES:
+;;; KEY - Key for encrypting the partial block
+;;; HASH - Current hash value
+;;; SMASHES:
+;;; r10, r12, r15, rax
+;;; T1, T2
+;;; Note:
+;;; PLAIN_CYPH_LEN, %6, is passed only to determine
+;;; if buffer is big enough to do a 16 byte read & shift.
+;;; 'LT16' is passed here only if buffer is known to be smaller
+;;; than 16 bytes.
+;;; Any other value passed here will result in 16 byte read
+;;; code path.
+;;; TBD: Remove HASH from the instantiation
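+;;; Rough outline (descriptive only): for buffers known to be shorter than
+;;; 16 bytes the block is gathered via READ_SMALL_DATA_INPUT; otherwise the
+;;; last 16 bytes of the message are re-read and shifted right by 16-r13
+;;; bytes using SHIFT_MASK. The encrypted counter in KEY is then XORed with
+;;; the data, the top 16-r13 bytes are masked off, and the r13 result bytes
+;;; are written out.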
+%macro ENCRYPT_FINAL_PARTIAL_BLOCK 8
+%define %%KEY %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%CYPH_PLAIN_OUT %4
+%define %%PLAIN_CYPH_IN %5
+%define %%PLAIN_CYPH_LEN %6
+%define %%ENC_DEC %7
+%define %%DATA_OFFSET %8
+
+ ;; NOTE: the type of read is tuned based on the %%PLAIN_CYPH_LEN setting
+%ifidn %%PLAIN_CYPH_LEN, LT16
+ ;; Handle the case where the message is < 16 bytes
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+
+ ;; T1 - packed output
+ ;; r10 - input data address
+ ;; r13 - input data length
+ ;; r12, r15, rax - temp registers
+ READ_SMALL_DATA_INPUT %%T1, r10, r13, r12, r15, rax
+
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13
+%else
+ ;; Handle the case where the message is >= 16 bytes
+ sub %%DATA_OFFSET, 16
+ add %%DATA_OFFSET, r13
+ ;; Receive the last <16 Byte block
+ vmovdqu %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET]
+ sub %%DATA_OFFSET, r13
+ add %%DATA_OFFSET, 16
+
+ lea r12, [SHIFT_MASK + 16]
+ ;; Adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+ ;; (r13 is the number of bytes in plaintext mod 16)
+ sub r12, r13
+ ;; Get the appropriate shuffle mask
+ vmovdqu %%T2, [r12]
+ ;; shift right 16-r13 bytes
+ vpshufb %%T1, %%T2
+%endif ; %%PLAIN_CYPH_LEN, LT16
+
+ ;; At this point T1 contains the partial block data
+%ifidn %%ENC_DEC, DEC
+ ;; Plaintext XOR E(K, Yn)
+ ;; Set aside the ciphertext
+ vmovdqa %%T2, %%T1
+ vpxor %%KEY, %%KEY, %%T1
+ ;; Get the appropriate mask to mask out top 16-r13 bytes of ciphertext
+ vmovdqu %%T1, [r12 + ALL_F - SHIFT_MASK]
+ ;; Mask out top 16-r13 bytes of ciphertext
+ vpand %%KEY, %%KEY, %%T1
+
+ ;; Prepare the ciphertext for the hash
+ ;; mask out top 16-r13 bytes of the plaintext
+ vpand %%T2, %%T2, %%T1
+%else
+ ;; Plaintext XOR E(K, Yn)
+ vpxor %%KEY, %%KEY, %%T1
+ ;; Get the appropriate mask to mask out top 16-r13 bytes of %%KEY
+ vmovdqu %%T1, [r12 + ALL_F - SHIFT_MASK]
+ ;; Mask out top 16-r13 bytes of %%KEY
+ vpand %%KEY, %%KEY, %%T1
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Output r13 Bytes
+ vmovq rax, %%KEY
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq %%T1, %%KEY, 8
+ vmovq rax, %%T1
+ sub r13, 8
+
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn %%ENC_DEC, DEC
+ ;; If decrypt, restore the ciphertext into %%KEY
+ vmovdqu %%KEY, %%T2
+%endif
+%endmacro ; ENCRYPT_FINAL_PARTIAL_BLOCK
+
+
+
+; Encryption of a single block
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%GDATA %1
+%define %%XMM0 %2
+
+ vpxor %%XMM0, %%XMM0, [%%GDATA+16*0]
+%assign i 1
+%rep NROUNDS
+ vaesenc %%XMM0, [%%GDATA+16*i]
+%assign i (i+1)
+%endrep
+ vaesenclast %%XMM0, [%%GDATA+16*i]
+%endmacro
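+
+;; Note: NROUNDS above counts only the middle vaesenc rounds; the first round
+;; key is applied with the initial vpxor and the last with vaesenclast, so
+;; NROUNDS is expected to be 9, 11 or 13 for AES-128/192/256 (10/12/14 rounds
+;; in total). NROUNDS itself is defined elsewhere, per key size.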
+
+
+;; Start of Stack Setup
+
+%macro FUNC_SAVE 0
+ ;; Required for Update/GCM_ENC
+ ; the number of pushes must equal STACK_OFFSET
+ push r12
+ push r13
+ push r14
+ push r15
+ push rsi
+ mov r14, rsp
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+ vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
+ vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
+ vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
+ vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
+ vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
+ vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
+ vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
+ vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
+ vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
+
+ mov arg5, arg(5) ;[r14 + STACK_OFFSET + 8*5]
+%endif
+%endmacro
+
+
+%macro FUNC_RESTORE 0
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15, [rsp + LOCAL_STORAGE + 9*16]
+ vmovdqu xmm14, [rsp + LOCAL_STORAGE + 8*16]
+ vmovdqu xmm13, [rsp + LOCAL_STORAGE + 7*16]
+ vmovdqu xmm12, [rsp + LOCAL_STORAGE + 6*16]
+ vmovdqu xmm11, [rsp + LOCAL_STORAGE + 5*16]
+ vmovdqu xmm10, [rsp + LOCAL_STORAGE + 4*16]
+ vmovdqu xmm9, [rsp + LOCAL_STORAGE + 3*16]
+ vmovdqu xmm8, [rsp + LOCAL_STORAGE + 2*16]
+ vmovdqu xmm7, [rsp + LOCAL_STORAGE + 1*16]
+ vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+
+;; Required for Update/GCM_ENC
+ mov rsp, r14
+ pop rsi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
+; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV,
+; Additional Authentication data (A_IN), Additional Data length (A_LEN).
+; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and the other parts of GDATA_CTX initialized.
+; Clobbers rax, r10-r13, and xmm0-xmm6
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 5
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%IV %3
+%define %%A_IN %4
+%define %%A_LEN %5
+%define %%AAD_HASH xmm14
+%define %%SUBHASH xmm1
+
+
+ vmovdqu %%SUBHASH, [%%GDATA_KEY + HashKey]
+
+ mov r10, %%A_LEN
+ cmp r10, 0
+ je %%_aad_is_zero
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
+ jmp %%_after_aad
+
+%%_aad_is_zero:
+ vpxor %%AAD_HASH, %%AAD_HASH
+
+%%_after_aad:
+ mov r10, %%A_LEN
+ vpxor xmm2, xmm3
+
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash
+ mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length
+ xor r10, r10
+ mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0
+ mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = 0
+ mov r10, %%IV
+ vmovdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001
+ vpinsrq xmm2, [r10], 0
+ vpinsrd xmm2, [r10+8], 2
+ vmovdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv
+
+ vpshufb xmm2, [SHUF_MASK]
+
+ vmovdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv
+%endmacro
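+
+;; Note: GCM_INIT assumes a 96-bit IV, as reflected above: the 12 IV bytes are
+;; padded with a big-endian 0x00000001 to form J0 (stored as OrigIV), while
+;; CurCount holds the same value byte-swapped through SHUF_MASK so that the
+;; counter can later be advanced with plain vpaddd adds.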
+
+%macro GCM_ENC_DEC_SMALL 12
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%ENC_DEC %6
+%define %%DATA_OFFSET %7
+%define %%LENGTH %8
+%define %%NUM_BLOCKS %9
+%define %%CTR %10
+%define %%HASH %11
+%define %%INSTANCE_TYPE %12
+
+ ;; NOTE: the check below is obsolete in the current implementation. The check is already done in GCM_ENC_DEC.
+ ;; cmp %%NUM_BLOCKS, 0
+ ;; je %%_small_initial_blocks_encrypted
+ cmp %%NUM_BLOCKS, 8
+ je %%_small_initial_num_blocks_is_8
+ cmp %%NUM_BLOCKS, 7
+ je %%_small_initial_num_blocks_is_7
+ cmp %%NUM_BLOCKS, 6
+ je %%_small_initial_num_blocks_is_6
+ cmp %%NUM_BLOCKS, 5
+ je %%_small_initial_num_blocks_is_5
+ cmp %%NUM_BLOCKS, 4
+ je %%_small_initial_num_blocks_is_4
+ cmp %%NUM_BLOCKS, 3
+ je %%_small_initial_num_blocks_is_3
+ cmp %%NUM_BLOCKS, 2
+ je %%_small_initial_num_blocks_is_2
+
+ jmp %%_small_initial_num_blocks_is_1
+
+
+%%_small_initial_num_blocks_is_8:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 8, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_7:
+ ;; r13 - %%LENGTH
+ ;; xmm12 - T1
+ ;; xmm13 - T2
+ ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys
+ ;; xmm15 - T4
+ ;; xmm11 - T5
+ ;; xmm9 - CTR
+ ;; xmm1 - XMM1 - Cipher + Hash when producing 8 AES keys
+ ;; xmm2 - XMM2
+ ;; xmm3 - XMM3
+ ;; xmm4 - XMM4
+ ;; xmm5 - XMM5
+ ;; xmm6 - XMM6
+ ;; xmm7 - XMM7
+ ;; xmm8 - XMM8 - AAD HASH IN
+ ;; xmm10 - T6
+ ;; xmm0 - T_key
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_6:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_5:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_4:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_3:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_2:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_1:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+
+ ;; Note: zero initial blocks not allowed.
+
+%%_small_initial_blocks_encrypted:
+
+%endmacro ; GCM_ENC_DEC_SMALL
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
+; has been initialized by GCM_INIT
+; Requires the input data to be at least 1 byte long because of READ_SMALL_DATA_INPUT.
+; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
+; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC).
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 7
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%ENC_DEC %6
+%define %%INSTANCE_TYPE %7
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16-byte blocks in the message
+; process (number of 16-byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16-byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
+
+ cmp %%PLAIN_CYPH_LEN, 0
+ je %%_enc_dec_done
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+ ;; Update length of data processed
+ add [%%GDATA_CTX+InLen], %%PLAIN_CYPH_LEN
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+ vmovdqu xmm8, [%%GDATA_CTX + AadHash]
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; NOTE: partial block processing only makes sense for multi_call here.
+ ;; Used for the update flow - if there was a previous partial
+ ;; block fill the remaining bytes here.
+ PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+%endif
+
+ ;; lift CTR set from initial_blocks to here
+%ifidn %%INSTANCE_TYPE, single_call
+ vmovdqu xmm9, xmm2
+%else
+ vmovdqu xmm9, [%%GDATA_CTX + CurCount]
+%endif
+
+ ;; Save the amount of data left to process in r13
+ mov r13, %%PLAIN_CYPH_LEN
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; NOTE: %%DATA_OFFSET is zero in single_call case.
+ ;; Consequently PLAIN_CYPH_LEN will never be zero after
+ ;; %%DATA_OFFSET subtraction below.
+ sub r13, %%DATA_OFFSET
+
+ ;; There may be no more data if it was consumed in the partial block.
+ cmp r13, 0
+ je %%_enc_dec_done
+%endif ; %%INSTANCE_TYPE, multi_call
+ mov r10, r13
+
+ ;; Determine how many blocks to process in INITIAL
+ mov r12, r13
+ shr r12, 4
+ and r12, 7
+
+ ;; Process one additional block in INITIAL if there is a partial block
+ and r10, 0xf
+ blsmsk r10, r10 ; Set CF if zero
+ cmc ; Flip CF
+ adc r12, 0x0 ; Process an additional INITIAL block if CF set
+
+ ;; Fewer than 128 bytes will be handled by the small message code, which
+ ;; can process up to 7 full 16B blocks plus a trailing partial block.
+ cmp r13, 128
+ jge %%_large_message_path
+
+ GCM_ENC_DEC_SMALL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, r13, r12, xmm9, xmm14, %%INSTANCE_TYPE
+ jmp %%_ghash_done
+
+%%_large_message_path:
+ and r12, 0x7 ; Still, don't allow 8 INITIAL blocks since this will
+ ; be handled by the x8 partial loop.
+
+ cmp r12, 0
+ je %%_initial_num_blocks_is_0
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+ ;; r13 - %%LENGTH
+ ;; xmm12 - T1
+ ;; xmm13 - T2
+ ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys
+ ;; xmm15 - T4
+ ;; xmm11 - T5
+ ;; xmm9 - CTR
+ ;; xmm1 - XMM1 - Cipher + Hash when producing 8 AES keys
+ ;; xmm2 - XMM2
+ ;; xmm3 - XMM3
+ ;; xmm4 - XMM4
+ ;; xmm5 - XMM5
+ ;; xmm6 - XMM6
+ ;; xmm7 - XMM7
+ ;; xmm8 - XMM8 - AAD HASH IN
+ ;; xmm10 - T6
+ ;; xmm0 - T_key
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ ;; The entire message was encrypted and processed in INITIAL and now needs to be hashed
+ cmp r13, 0
+ je %%_encrypt_done
+
+ ;; Encrypt the final <16 byte (partial) block, then hash
+ cmp r13, 16
+ jl %%_encrypt_final_partial
+
+ ;; Process 7 full blocks plus a partial block
+ cmp r13, 128
+ jl %%_encrypt_by_8_partial
+
+
+%%_encrypt_by_8_parallel:
+ ;; in_order vs. out_order is an optimization to increment the counter without shuffling
+ ;; it back into little endian. r15d keeps track of when we need to increment in order so
+ ;; that the carry is handled correctly.
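+ ;; Rough example: if the low counter byte is above 255-8 (say 250), adding
+ ;; the next 8 block increments would carry out of that byte, so the slower
+ ;; in_order path below shuffles the counter back to little endian, increments
+ ;; it there and shuffles it again; otherwise the out_order path adds
+ ;; ONEf/TWOf directly on the byte-swapped form.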
+ vmovd r15d, xmm9
+ and r15d, 255
+ vpshufb xmm9, [rel SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ cmp r15d, 255-8
+ jg %%_encrypt_by_8
+
+
+
+ ;; xmm0 - T1
+ ;; xmm10 - T2
+ ;; xmm11 - T3
+ ;; xmm12 - T4
+ ;; xmm13 - T5
+ ;; xmm14 - T6
+ ;; xmm9 - CTR
+ ;; xmm1 - XMM1
+ ;; xmm2 - XMM2
+ ;; xmm3 - XMM3
+ ;; xmm4 - XMM4
+ ;; xmm5 - XMM5
+ ;; xmm6 - XMM6
+ ;; xmm7 - XMM7
+ ;; xmm8 - XMM8
+ ;; xmm15 - T7
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC, full
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ cmp r13, 128
+ jge %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+ jmp %%_encrypt_by_8_parallel_done
+
+%%_encrypt_by_8:
+ vpshufb xmm9, [SHUF_MASK]
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, full
+ vpshufb xmm9, [SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ cmp r13, 128
+ jge %%_encrypt_by_8_new
+ vpshufb xmm9, [SHUF_MASK]
+
+
+%%_encrypt_by_8_parallel_done:
+ ;; Test to see if we need a by-8 pass with a partial block. At this point
+ ;; the bytes remaining should be either zero or between 113 and 127.
+ cmp r13, 0
+ je %%_encrypt_done
+
+%%_encrypt_by_8_partial:
+ ;; Shuffle needed to align key for partial block xor. out_order
+ ;; is a little faster because it avoids extra shuffles.
+ ;; TBD: Might need to account for when we don't have room to increment the counter.
+
+
+ ;; Process parallel buffers with a final partial block.
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, partial
+
+
+ add %%DATA_OFFSET, 128-16
+ sub r13, 128-16
+
+%%_encrypt_final_partial:
+
+ vpshufb xmm8, [SHUF_MASK]
+ mov [%%GDATA_CTX + PBlockLen], r13
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm8
+
+ ;; xmm8 - Final encrypted counter - need to hash with partial or full block ciphertext
+ ;; GDATA, KEY, T1, T2
+ ENCRYPT_FINAL_PARTIAL_BLOCK xmm8, xmm0, xmm10, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET
+
+ vpshufb xmm8, [SHUF_MASK]
+
+
+%%_encrypt_done:
+
+ ;; Mapping to macro parameters
+ ;; IN:
+ ;; xmm9 contains the counter
+ ;; xmm1-xmm8 contain the xor'd ciphertext
+ ;; OUT:
+ ;; xmm14 contains the final hash
+ ;; GDATA, T1, T2, T3, T4, T5, T6, T7, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+%ifidn %%INSTANCE_TYPE, multi_call
+ mov r13, [%%GDATA_CTX + PBlockLen]
+ cmp r13, 0
+ jz %%_hash_last_8
+ GHASH_LAST_7 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+ ;; XOR the partial block ciphertext into the hash
+ vpxor xmm14, xmm14, xmm8
+ jmp %%_ghash_done
+%endif
+%%_hash_last_8:
+ GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+%%_ghash_done:
+ vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14 ; my_ctx_data.aad hash = xmm14
+
+%%_enc_dec_done:
+
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of the last partial block after GCM_UPDATE finishes.
+; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX) and whether encoding or decoding (ENC_DEC).
+; Output: Authentication Tag (AUTH_TAG) and Authentication Tag length (AUTH_TAG_LEN)
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 6
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%AUTH_TAG %3
+%define %%AUTH_TAG_LEN %4
+%define %%ENC_DEC %5
+%define %%INSTANCE_TYPE %6
+%define %%PLAIN_CYPH_LEN rax
+
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+ ;; Start AES as early as possible
+ vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0
+ ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0)
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; When the GCM functions are invoked as individual parts (init, update,
+ ;; finalize) the AAD hash must be reloaded here; in the single_call case
+ ;; xmm14 already holds it, which removes a write-to-read dependency on AadHash.
+ vmovdqu xmm14, [%%GDATA_CTX + AadHash]
+
+ ;; GHASH the final partial block. If we did this as a single call then
+ ;; the partial block was already handled in the main GCM_ENC_DEC macro.
+ mov r12, [%%GDATA_CTX + PBlockLen]
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+%%_partial_done:
+
+%endif
+
+ mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
+
+ shl r12, 3 ; convert into number of bits
+ vmovd xmm15, r12d ; len(A) in xmm15
+
+ shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*8)
+ vmovq xmm1, %%PLAIN_CYPH_LEN
+ vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ vpxor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ vpxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6
+ vpshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpxor xmm9, xmm9, xmm14
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+%%_T_8:
+ vmovq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ vmovq rax, xmm9
+ mov [r10], rax
+ vpsrldq xmm9, xmm9, 8
+ vmovd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+
+%%_T_16:
+ vmovdqu [r10], xmm9
+
+%%_return_T_done:
+%endmacro ; GCM_COMPLETE
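+
+;; For reference, GCM_COMPLETE implements the standard GCM tag computation:
+;;     S = GHASH_H( A || C || len(A)_64 || len(C)_64 )   ; lengths in bits,
+;;                                                       ; A and C zero-padded
+;;     T = E(K, J0) XOR S, truncated to 8, 12 or 16 bytes (AUTH_TAG_LEN)
+;; where J0 is the saved OrigIV and H is the hash key from GDATA_KEY. This is
+;; a descriptive sketch of the math only, not additional code.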
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_precomp_128_avx_gen4 /
+; aes_gcm_precomp_192_avx_gen4 /
+; aes_gcm_precomp_256_avx_gen4
+; (struct gcm_key_data *key_data)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(precomp,_)
+FN_NAME(precomp,_):
+ endbranch
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp
+
+
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; only xmm6 needs to be maintained
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+ vpxor xmm6, xmm6
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
+
+ vpshufb xmm6, [rel SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+ vmovdqa xmm2, xmm6
+ vpsllq xmm6, xmm6, 1
+ vpsrlq xmm2, xmm2, 63
+ vmovdqa xmm1, xmm2
+ vpslldq xmm2, xmm2, 8
+ vpsrldq xmm1, xmm1, 8
+ vpor xmm6, xmm6, xmm2
+ ;reduction
+ vpshufd xmm2, xmm1, 00100100b
+ vpcmpeqd xmm2, [TWOONE]
+ vpand xmm2, xmm2, [POLY]
+ vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+ mov rsp, r14
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_init_128_avx_gen4 / aes_gcm_init_192_avx_gen4 / aes_gcm_init_256_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(init,_)
+FN_NAME(init,_):
+ endbranch
+ push r12
+ push r13
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ push arg5
+ sub rsp, 1*16
+ vmovdqu [rsp + 0*16],xmm6
+ mov arg5, [rsp + 1*16 + 8*3 + 8*5]
+%endif
+
+ GCM_INIT arg1, arg2, arg3, arg4, arg5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + 0*16]
+ add rsp, 1*16
+ pop arg5
+%endif
+ pop r13
+ pop r12
+ ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_update_avx_gen4 / aes_gcm_enc_192_update_avx_gen4 /
+; aes_gcm_enc_128_update_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(enc,_update_)
+FN_NAME(enc,_update_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_update_avx_gen4 / aes_gcm_dec_192_update_avx_gen4 /
+; aes_gcm_dec_256_update_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(dec,_update_)
+FN_NAME(dec,_update_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_finalize_avx_gen4 / aes_gcm_enc_192_finalize_avx_gen4 /
+; aes_gcm_enc_256_finalize_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(enc,_finalize_)
+FN_NAME(enc,_finalize_):
+ endbranch
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ vmovdqu [rsp + 0*16], xmm6
+ vmovdqu [rsp + 1*16], xmm9
+ vmovdqu [rsp + 2*16], xmm11
+ vmovdqu [rsp + 3*16], xmm14
+ vmovdqu [rsp + 4*16], xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, arg4, ENC, multi_call
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15, [rsp + 4*16]
+ vmovdqu xmm14, [rsp + 3*16]
+ vmovdqu xmm11, [rsp + 2*16]
+ vmovdqu xmm9, [rsp + 1*16]
+ vmovdqu xmm6, [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_finalize_avx_gen4 / aes_gcm_dec_192_finalize_avx_gen4
+; aes_gcm_dec_256_finalize_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(dec,_finalize_)
+FN_NAME(dec,_finalize_):
+ endbranch
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ vmovdqu [rsp + 0*16], xmm6
+ vmovdqu [rsp + 1*16], xmm9
+ vmovdqu [rsp + 2*16], xmm11
+ vmovdqu [rsp + 3*16], xmm14
+ vmovdqu [rsp + 4*16], xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, arg4, DEC, multi_call
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15, [rsp + 4*16]
+ vmovdqu xmm14, [rsp + 3*16]
+ vmovdqu xmm11, [rsp + 2*16]
+ vmovdqu xmm9, [rsp + 1*16]
+ vmovdqu xmm6, [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_avx_gen4 / aes_gcm_enc_192_avx_gen4 / aes_gcm_enc_256_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(enc,_)
+FN_NAME(enc,_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call
+
+ GCM_COMPLETE arg1, arg2, arg9, arg10, ENC, single_call
+
+ FUNC_RESTORE
+
+ ret
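+
+;; Hedged usage sketch (C-like pseudocode in comment form; the prototypes are
+;; the ones documented in this file, while key/context setup such as key
+;; expansion is assumed to happen elsewhere, e.g. alongside the precomp
+;; routine above):
+;;     struct gcm_key_data key;        /* expanded keys + hash key powers */
+;;     struct gcm_context_data ctx;
+;;     /* single-shot: */
+;;     aes_gcm_enc_128_avx_gen4(&key, &ctx, out, in, len, iv, aad, aad_len,
+;;                              tag, tag_len);
+;;     /* or streaming: init + N x update + finalize */
+;;     aes_gcm_init_128_avx_gen4(&key, &ctx, iv, aad, aad_len);
+;;     aes_gcm_enc_128_update_avx_gen4(&key, &ctx, out, in, len);
+;;     aes_gcm_enc_128_finalize_avx_gen4(&key, &ctx, tag, tag_len);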
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_avx_gen4 / aes_gcm_dec_192_avx_gen4 / aes_gcm_dec_256_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(dec,_)
+FN_NAME(dec,_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call
+
+ GCM_COMPLETE arg1, arg2, arg9, arg10, DEC, single_call
+
+ FUNC_RESTORE
+
+ ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm
new file mode 100644
index 000000000..e823b7959
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm
@@ -0,0 +1,291 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifndef GCM_DEFINES_ASM_INCLUDED
+%define GCM_DEFINES_ASM_INCLUDED
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+
+
+;;;;;;
+
+section .data
+
+align 16
+
+POLY dq 0x0000000000000001, 0xC200000000000000
+
+align 64
+POLY2 dq 0x00000001C2000000, 0xC200000000000000
+ dq 0x00000001C2000000, 0xC200000000000000
+ dq 0x00000001C2000000, 0xC200000000000000
+ dq 0x00000001C2000000, 0xC200000000000000
+align 16
+TWOONE dq 0x0000000000000001, 0x0000000100000000
+
+; order of these constants should not change.
+; more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
+
+align 64
+SHUF_MASK dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+ dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+ dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+ dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+SHIFT_MASK dq 0x0706050403020100, 0x0f0e0d0c0b0a0908
+ALL_F dq 0xffffffffffffffff, 0xffffffffffffffff
+ZERO dq 0x0000000000000000, 0x0000000000000000
+ONE dq 0x0000000000000001, 0x0000000000000000
+TWO dq 0x0000000000000002, 0x0000000000000000
+ONEf dq 0x0000000000000000, 0x0100000000000000
+TWOf dq 0x0000000000000000, 0x0200000000000000
+
+align 64
+ddq_add_1234:
+ dq 0x0000000000000001, 0x0000000000000000
+ dq 0x0000000000000002, 0x0000000000000000
+ dq 0x0000000000000003, 0x0000000000000000
+ dq 0x0000000000000004, 0x0000000000000000
+
+align 64
+ddq_add_5678:
+ dq 0x0000000000000005, 0x0000000000000000
+ dq 0x0000000000000006, 0x0000000000000000
+ dq 0x0000000000000007, 0x0000000000000000
+ dq 0x0000000000000008, 0x0000000000000000
+
+align 64
+ddq_add_4444:
+ dq 0x0000000000000004, 0x0000000000000000
+ dq 0x0000000000000004, 0x0000000000000000
+ dq 0x0000000000000004, 0x0000000000000000
+ dq 0x0000000000000004, 0x0000000000000000
+
+align 64
+ddq_add_8888:
+ dq 0x0000000000000008, 0x0000000000000000
+ dq 0x0000000000000008, 0x0000000000000000
+ dq 0x0000000000000008, 0x0000000000000000
+ dq 0x0000000000000008, 0x0000000000000000
+
+align 64
+ddq_addbe_1234:
+ dq 0x0000000000000000, 0x0100000000000000
+ dq 0x0000000000000000, 0x0200000000000000
+ dq 0x0000000000000000, 0x0300000000000000
+ dq 0x0000000000000000, 0x0400000000000000
+
+align 64
+ddq_addbe_5678:
+ dq 0x0000000000000000, 0x0500000000000000
+ dq 0x0000000000000000, 0x0600000000000000
+ dq 0x0000000000000000, 0x0700000000000000
+ dq 0x0000000000000000, 0x0800000000000000
+
+align 64
+ddq_addbe_4444:
+ dq 0x0000000000000000, 0x0400000000000000
+ dq 0x0000000000000000, 0x0400000000000000
+ dq 0x0000000000000000, 0x0400000000000000
+ dq 0x0000000000000000, 0x0400000000000000
+
+align 64
+ddq_addbe_8888:
+ dq 0x0000000000000000, 0x0800000000000000
+ dq 0x0000000000000000, 0x0800000000000000
+ dq 0x0000000000000000, 0x0800000000000000
+ dq 0x0000000000000000, 0x0800000000000000
+
+align 64
+byte_len_to_mask_table:
+ dw 0x0000, 0x0001, 0x0003, 0x0007,
+ dw 0x000f, 0x001f, 0x003f, 0x007f,
+ dw 0x00ff, 0x01ff, 0x03ff, 0x07ff,
+ dw 0x0fff, 0x1fff, 0x3fff, 0x7fff,
+ dw 0xffff
+
+align 64
+byte64_len_to_mask_table:
+ dq 0x0000000000000000, 0x0000000000000001
+ dq 0x0000000000000003, 0x0000000000000007
+ dq 0x000000000000000f, 0x000000000000001f
+ dq 0x000000000000003f, 0x000000000000007f
+ dq 0x00000000000000ff, 0x00000000000001ff
+ dq 0x00000000000003ff, 0x00000000000007ff
+ dq 0x0000000000000fff, 0x0000000000001fff
+ dq 0x0000000000003fff, 0x0000000000007fff
+ dq 0x000000000000ffff, 0x000000000001ffff
+ dq 0x000000000003ffff, 0x000000000007ffff
+ dq 0x00000000000fffff, 0x00000000001fffff
+ dq 0x00000000003fffff, 0x00000000007fffff
+ dq 0x0000000000ffffff, 0x0000000001ffffff
+ dq 0x0000000003ffffff, 0x0000000007ffffff
+ dq 0x000000000fffffff, 0x000000001fffffff
+ dq 0x000000003fffffff, 0x000000007fffffff
+ dq 0x00000000ffffffff, 0x00000001ffffffff
+ dq 0x00000003ffffffff, 0x00000007ffffffff
+ dq 0x0000000fffffffff, 0x0000001fffffffff
+ dq 0x0000003fffffffff, 0x0000007fffffffff
+ dq 0x000000ffffffffff, 0x000001ffffffffff
+ dq 0x000003ffffffffff, 0x000007ffffffffff
+ dq 0x00000fffffffffff, 0x00001fffffffffff
+ dq 0x00003fffffffffff, 0x00007fffffffffff
+ dq 0x0000ffffffffffff, 0x0001ffffffffffff
+ dq 0x0003ffffffffffff, 0x0007ffffffffffff
+ dq 0x000fffffffffffff, 0x001fffffffffffff
+ dq 0x003fffffffffffff, 0x007fffffffffffff
+ dq 0x00ffffffffffffff, 0x01ffffffffffffff
+ dq 0x03ffffffffffffff, 0x07ffffffffffffff
+ dq 0x0fffffffffffffff, 0x1fffffffffffffff
+ dq 0x3fffffffffffffff, 0x7fffffffffffffff
+ dq 0xffffffffffffffff
+
+align 64
+mask_out_top_block:
+ dq 0xffffffffffffffff, 0xffffffffffffffff
+ dq 0xffffffffffffffff, 0xffffffffffffffff
+ dq 0xffffffffffffffff, 0xffffffffffffffff
+ dq 0x0000000000000000, 0x0000000000000000
+
+section .text
+
+
+;;define the fields of gcm_data struct
+;typedef struct gcm_data
+;{
+; u8 expanded_keys[16*15];
+; u8 shifted_hkey_1[16]; // store HashKey <<1 mod poly here
+; u8 shifted_hkey_2[16]; // store HashKey^2 <<1 mod poly here
+; u8 shifted_hkey_3[16]; // store HashKey^3 <<1 mod poly here
+; u8 shifted_hkey_4[16]; // store HashKey^4 <<1 mod poly here
+; u8 shifted_hkey_5[16]; // store HashKey^5 <<1 mod poly here
+; u8 shifted_hkey_6[16]; // store HashKey^6 <<1 mod poly here
+; u8 shifted_hkey_7[16]; // store HashKey^7 <<1 mod poly here
+; u8 shifted_hkey_8[16]; // store HashKey^8 <<1 mod poly here
+; u8 shifted_hkey_1_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_2_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_3_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_4_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_5_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_6_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_7_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_8_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+;} gcm_data;
+
+%ifndef GCM_KEYS_VAES_AVX512_INCLUDED
+%define HashKey 16*15 ; store HashKey <<1 mod poly here
+%define HashKey_1 16*15 ; store HashKey <<1 mod poly here
+%define HashKey_2 16*16 ; store HashKey^2 <<1 mod poly here
+%define HashKey_3 16*17 ; store HashKey^3 <<1 mod poly here
+%define HashKey_4 16*18 ; store HashKey^4 <<1 mod poly here
+%define HashKey_5 16*19 ; store HashKey^5 <<1 mod poly here
+%define HashKey_6 16*20 ; store HashKey^6 <<1 mod poly here
+%define HashKey_7 16*21 ; store HashKey^7 <<1 mod poly here
+%define HashKey_8 16*22 ; store HashKey^8 <<1 mod poly here
+%define HashKey_k 16*23 ; store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_2_k 16*24 ; store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_3_k 16*25 ; store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_4_k 16*26 ; store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_5_k 16*27 ; store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_6_k 16*28 ; store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_7_k 16*29 ; store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_8_k 16*30 ; store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+%endif
+
+%define AadHash 16*0 ; store current Hash of data which has been input
+%define AadLen 16*1 ; store length of input data which will not be encrypted or decrypted
+%define InLen (16*1)+8 ; store length of input data which will be encrypted or decrypted
+%define PBlockEncKey 16*2 ; encryption key for the partial block at the end of the previous update
+%define OrigIV 16*3 ; input IV
+%define CurCount 16*4 ; Current counter for generation of encryption key
+%define PBlockLen 16*5 ; length of partial block at the end of the previous update
+
+%define reg(q) xmm %+ q
+%define arg(x) [r14 + STACK_OFFSET + 8*x]
+
+
+
+
+%ifnidn __OUTPUT_FORMAT__, elf64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+ %xdefine arg4 r9
+ %xdefine arg5 rsi ;[r14 + STACK_OFFSET + 8*5] - need push and load
+ %xdefine arg6 [r14 + STACK_OFFSET + 8*6]
+ %xdefine arg7 [r14 + STACK_OFFSET + 8*7]
+ %xdefine arg8 [r14 + STACK_OFFSET + 8*8]
+ %xdefine arg9 [r14 + STACK_OFFSET + 8*9]
+ %xdefine arg10 [r14 + STACK_OFFSET + 8*10]
+
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+ %xdefine arg4 rcx
+ %xdefine arg5 r8
+ %xdefine arg6 r9
+ %xdefine arg7 [r14 + STACK_OFFSET + 8*1]
+ %xdefine arg8 [r14 + STACK_OFFSET + 8*2]
+ %xdefine arg9 [r14 + STACK_OFFSET + 8*3]
+ %xdefine arg10 [r14 + STACK_OFFSET + 8*4]
+%endif
+
+%ifdef NT_LDST
+ %define NT_LD
+ %define NT_ST
+%endif
+
+;;; Use non-temporal loads
+%ifdef NT_LD
+ %define XLDR movntdqa
+ %define VXLDR vmovntdqa
+ %define VX512LDR vmovntdqa
+%else
+ %define XLDR movdqu
+ %define VXLDR vmovdqu
+ %define VX512LDR vmovdqu8
+%endif
+
+;;; Use non-temporal stores
+%ifdef NT_ST
+ %define XSTR movntdq
+ %define VXSTR vmovntdq
+ %define VX512STR vmovntdq
+%else
+ %define XSTR movdqu
+ %define VXSTR vmovdqu
+ %define VX512STR vmovdqu8
+%endif
+
+%endif ; GCM_DEFINES_ASM_INCLUDED
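
The XLDR/VXLDR/VX512LDR and XSTR/VXSTR/VX512STR macros above switch every bulk data access between streaming (non-temporal) and ordinary unaligned moves when NT_LDST is defined. A rough C-intrinsics analogue of the SSE pair, using standard intrinsic names rather than anything defined by this library:

#include <immintrin.h>

#ifdef NT_LDST
# define XLDR(p)     _mm_stream_load_si128((__m128i *)(p))        /* movntdqa */
# define XSTR(p, x)  _mm_stream_si128((__m128i *)(p), (x))        /* movntdq  */
#else
# define XLDR(p)     _mm_loadu_si128((const __m128i *)(p))        /* movdqu   */
# define XSTR(p, x)  _mm_storeu_si128((__m128i *)(p), (x))        /* movdqu   */
#endif
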
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_keys_vaes_avx512.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_keys_vaes_avx512.asm
new file mode 100644
index 000000000..fd8aa05a6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_keys_vaes_avx512.asm
@@ -0,0 +1,233 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifndef GCM_KEYS_VAES_AVX512_INCLUDED
+%define GCM_KEYS_VAES_AVX512_INCLUDED
+
+;; Define the fields of gcm_key_data struct:
+;; uint8_t expanded_keys[GCM_ENC_KEY_LEN * GCM_KEY_SETS];
+;; uint8_t shifted_hkey_9_128[GCM_ENC_KEY_LEN * (128 - 8)];
+;; uint8_t shifted_hkey_8[GCM_ENC_KEY_LEN]; // HashKey^8 <<1 mod poly
+;; uint8_t shifted_hkey_7[GCM_ENC_KEY_LEN]; // HashKey^7 <<1 mod poly
+;; uint8_t shifted_hkey_6[GCM_ENC_KEY_LEN]; // HashKey^6 <<1 mod poly
+;; uint8_t shifted_hkey_5[GCM_ENC_KEY_LEN]; // HashKey^5 <<1 mod poly
+;; uint8_t shifted_hkey_4[GCM_ENC_KEY_LEN]; // HashKey^4 <<1 mod poly
+;; uint8_t shifted_hkey_3[GCM_ENC_KEY_LEN]; // HashKey^3 <<1 mod poly
+;; uint8_t shifted_hkey_2[GCM_ENC_KEY_LEN]; // HashKey^2 <<1 mod poly
+;; uint8_t shifted_hkey_1[GCM_ENC_KEY_LEN]; // HashKey <<1 mod poly
+
+%ifdef GCM_BIG_DATA
+;;
+;; Key structure holds up to 128 ghash keys
+;;
+%define HashKey_128 (16*15) ; HashKey^128 <<1 mod poly
+%define HashKey_127 (16*16) ; HashKey^127 <<1 mod poly
+%define HashKey_126 (16*17) ; HashKey^126 <<1 mod poly
+%define HashKey_125 (16*18) ; HashKey^125 <<1 mod poly
+%define HashKey_124 (16*19) ; HashKey^124 <<1 mod poly
+%define HashKey_123 (16*20) ; HashKey^123 <<1 mod poly
+%define HashKey_122 (16*21) ; HashKey^122 <<1 mod poly
+%define HashKey_121 (16*22) ; HashKey^121 <<1 mod poly
+%define HashKey_120 (16*23) ; HashKey^120 <<1 mod poly
+%define HashKey_119 (16*24) ; HashKey^119 <<1 mod poly
+%define HashKey_118 (16*25) ; HashKey^118 <<1 mod poly
+%define HashKey_117 (16*26) ; HashKey^117 <<1 mod poly
+%define HashKey_116 (16*27) ; HashKey^116 <<1 mod poly
+%define HashKey_115 (16*28) ; HashKey^115 <<1 mod poly
+%define HashKey_114 (16*29) ; HashKey^114 <<1 mod poly
+%define HashKey_113 (16*30) ; HashKey^113 <<1 mod poly
+%define HashKey_112 (16*31) ; HashKey^112 <<1 mod poly
+%define HashKey_111 (16*32) ; HashKey^111 <<1 mod poly
+%define HashKey_110 (16*33) ; HashKey^110 <<1 mod poly
+%define HashKey_109 (16*34) ; HashKey^109 <<1 mod poly
+%define HashKey_108 (16*35) ; HashKey^108 <<1 mod poly
+%define HashKey_107 (16*36) ; HashKey^107 <<1 mod poly
+%define HashKey_106 (16*37) ; HashKey^106 <<1 mod poly
+%define HashKey_105 (16*38) ; HashKey^105 <<1 mod poly
+%define HashKey_104 (16*39) ; HashKey^104 <<1 mod poly
+%define HashKey_103 (16*40) ; HashKey^103 <<1 mod poly
+%define HashKey_102 (16*41) ; HashKey^102 <<1 mod poly
+%define HashKey_101 (16*42) ; HashKey^101 <<1 mod poly
+%define HashKey_100 (16*43) ; HashKey^100 <<1 mod poly
+%define HashKey_99 (16*44) ; HashKey^99 <<1 mod poly
+%define HashKey_98 (16*45) ; HashKey^98 <<1 mod poly
+%define HashKey_97 (16*46) ; HashKey^97 <<1 mod poly
+%define HashKey_96 (16*47) ; HashKey^96 <<1 mod poly
+%define HashKey_95 (16*48) ; HashKey^95 <<1 mod poly
+%define HashKey_94 (16*49) ; HashKey^94 <<1 mod poly
+%define HashKey_93 (16*50) ; HashKey^93 <<1 mod poly
+%define HashKey_92 (16*51) ; HashKey^92 <<1 mod poly
+%define HashKey_91 (16*52) ; HashKey^91 <<1 mod poly
+%define HashKey_90 (16*53) ; HashKey^90 <<1 mod poly
+%define HashKey_89 (16*54) ; HashKey^89 <<1 mod poly
+%define HashKey_88 (16*55) ; HashKey^88 <<1 mod poly
+%define HashKey_87 (16*56) ; HashKey^87 <<1 mod poly
+%define HashKey_86 (16*57) ; HashKey^86 <<1 mod poly
+%define HashKey_85 (16*58) ; HashKey^85 <<1 mod poly
+%define HashKey_84 (16*59) ; HashKey^84 <<1 mod poly
+%define HashKey_83 (16*60) ; HashKey^83 <<1 mod poly
+%define HashKey_82 (16*61) ; HashKey^82 <<1 mod poly
+%define HashKey_81 (16*62) ; HashKey^81 <<1 mod poly
+%define HashKey_80 (16*63) ; HashKey^80 <<1 mod poly
+%define HashKey_79 (16*64) ; HashKey^79 <<1 mod poly
+%define HashKey_78 (16*65) ; HashKey^78 <<1 mod poly
+%define HashKey_77 (16*66) ; HashKey^77 <<1 mod poly
+%define HashKey_76 (16*67) ; HashKey^76 <<1 mod poly
+%define HashKey_75 (16*68) ; HashKey^75 <<1 mod poly
+%define HashKey_74 (16*69) ; HashKey^74 <<1 mod poly
+%define HashKey_73 (16*70) ; HashKey^73 <<1 mod poly
+%define HashKey_72 (16*71) ; HashKey^72 <<1 mod poly
+%define HashKey_71 (16*72) ; HashKey^71 <<1 mod poly
+%define HashKey_70 (16*73) ; HashKey^70 <<1 mod poly
+%define HashKey_69 (16*74) ; HashKey^69 <<1 mod poly
+%define HashKey_68 (16*75) ; HashKey^68 <<1 mod poly
+%define HashKey_67 (16*76) ; HashKey^67 <<1 mod poly
+%define HashKey_66 (16*77) ; HashKey^66 <<1 mod poly
+%define HashKey_65 (16*78) ; HashKey^65 <<1 mod poly
+%define HashKey_64 (16*79) ; HashKey^64 <<1 mod poly
+%define HashKey_63 (16*80) ; HashKey^63 <<1 mod poly
+%define HashKey_62 (16*81) ; HashKey^62 <<1 mod poly
+%define HashKey_61 (16*82) ; HashKey^61 <<1 mod poly
+%define HashKey_60 (16*83) ; HashKey^60 <<1 mod poly
+%define HashKey_59 (16*84) ; HashKey^59 <<1 mod poly
+%define HashKey_58 (16*85) ; HashKey^58 <<1 mod poly
+%define HashKey_57 (16*86) ; HashKey^57 <<1 mod poly
+%define HashKey_56 (16*87) ; HashKey^56 <<1 mod poly
+%define HashKey_55 (16*88) ; HashKey^55 <<1 mod poly
+%define HashKey_54 (16*89) ; HashKey^54 <<1 mod poly
+%define HashKey_53 (16*90) ; HashKey^53 <<1 mod poly
+%define HashKey_52 (16*91) ; HashKey^52 <<1 mod poly
+%define HashKey_51 (16*92) ; HashKey^51 <<1 mod poly
+%define HashKey_50 (16*93) ; HashKey^50 <<1 mod poly
+%define HashKey_49 (16*94) ; HashKey^49 <<1 mod poly
+%define HashKey_48 (16*95) ; HashKey^48 <<1 mod poly
+%define HashKey_47 (16*96) ; HashKey^47 <<1 mod poly
+%define HashKey_46 (16*97) ; HashKey^46 <<1 mod poly
+%define HashKey_45 (16*98) ; HashKey^45 <<1 mod poly
+%define HashKey_44 (16*99) ; HashKey^44 <<1 mod poly
+%define HashKey_43 (16*100) ; HashKey^43 <<1 mod poly
+%define HashKey_42 (16*101) ; HashKey^42 <<1 mod poly
+%define HashKey_41 (16*102) ; HashKey^41 <<1 mod poly
+%define HashKey_40 (16*103) ; HashKey^40 <<1 mod poly
+%define HashKey_39 (16*104) ; HashKey^39 <<1 mod poly
+%define HashKey_38 (16*105) ; HashKey^38 <<1 mod poly
+%define HashKey_37 (16*106) ; HashKey^37 <<1 mod poly
+%define HashKey_36 (16*107) ; HashKey^36 <<1 mod poly
+%define HashKey_35 (16*108) ; HashKey^35 <<1 mod poly
+%define HashKey_34 (16*109) ; HashKey^34 <<1 mod poly
+%define HashKey_33 (16*110) ; HashKey^33 <<1 mod poly
+%define HashKey_32 (16*111) ; HashKey^32 <<1 mod poly
+%define HashKey_31 (16*112) ; HashKey^31 <<1 mod poly
+%define HashKey_30 (16*113) ; HashKey^30 <<1 mod poly
+%define HashKey_29 (16*114) ; HashKey^29 <<1 mod poly
+%define HashKey_28 (16*115) ; HashKey^28 <<1 mod poly
+%define HashKey_27 (16*116) ; HashKey^27 <<1 mod poly
+%define HashKey_26 (16*117) ; HashKey^26 <<1 mod poly
+%define HashKey_25 (16*118) ; HashKey^25 <<1 mod poly
+%define HashKey_24 (16*119) ; HashKey^24 <<1 mod poly
+%define HashKey_23 (16*120) ; HashKey^23 <<1 mod poly
+%define HashKey_22 (16*121) ; HashKey^22 <<1 mod poly
+%define HashKey_21 (16*122) ; HashKey^21 <<1 mod poly
+%define HashKey_20 (16*123) ; HashKey^20 <<1 mod poly
+%define HashKey_19 (16*124) ; HashKey^19 <<1 mod poly
+%define HashKey_18 (16*125) ; HashKey^18 <<1 mod poly
+%define HashKey_17 (16*126) ; HashKey^17 <<1 mod poly
+%define HashKey_16 (16*127) ; HashKey^16 <<1 mod poly
+%define HashKey_15 (16*128) ; HashKey^15 <<1 mod poly
+%define HashKey_14 (16*129) ; HashKey^14 <<1 mod poly
+%define HashKey_13 (16*130) ; HashKey^13 <<1 mod poly
+%define HashKey_12 (16*131) ; HashKey^12 <<1 mod poly
+%define HashKey_11 (16*132) ; HashKey^11 <<1 mod poly
+%define HashKey_10 (16*133) ; HashKey^10 <<1 mod poly
+%define HashKey_9 (16*134) ; HashKey^9 <<1 mod poly
+%define HashKey_8 (16*135) ; HashKey^8 <<1 mod poly
+%define HashKey_7 (16*136) ; HashKey^7 <<1 mod poly
+%define HashKey_6 (16*137) ; HashKey^6 <<1 mod poly
+%define HashKey_5 (16*138) ; HashKey^5 <<1 mod poly
+%define HashKey_4 (16*139) ; HashKey^4 <<1 mod poly
+%define HashKey_3 (16*140) ; HashKey^3 <<1 mod poly
+%define HashKey_2 (16*141) ; HashKey^2 <<1 mod poly
+%define HashKey_1 (16*142) ; HashKey <<1 mod poly
+%define HashKey (16*142) ; HashKey <<1 mod poly
+%else
+;;
+;; Key structure holds up to 48 ghash keys
+;;
+%define HashKey_48 (16*15) ; HashKey^48 <<1 mod poly
+%define HashKey_47 (16*16) ; HashKey^47 <<1 mod poly
+%define HashKey_46 (16*17) ; HashKey^46 <<1 mod poly
+%define HashKey_45 (16*18) ; HashKey^45 <<1 mod poly
+%define HashKey_44 (16*19) ; HashKey^44 <<1 mod poly
+%define HashKey_43 (16*20) ; HashKey^43 <<1 mod poly
+%define HashKey_42 (16*21) ; HashKey^42 <<1 mod poly
+%define HashKey_41 (16*22) ; HashKey^41 <<1 mod poly
+%define HashKey_40 (16*23) ; HashKey^40 <<1 mod poly
+%define HashKey_39 (16*24) ; HashKey^39 <<1 mod poly
+%define HashKey_38 (16*25) ; HashKey^38 <<1 mod poly
+%define HashKey_37 (16*26) ; HashKey^37 <<1 mod poly
+%define HashKey_36 (16*27) ; HashKey^36 <<1 mod poly
+%define HashKey_35 (16*28) ; HashKey^35 <<1 mod poly
+%define HashKey_34 (16*29) ; HashKey^34 <<1 mod poly
+%define HashKey_33 (16*30) ; HashKey^33 <<1 mod poly
+%define HashKey_32 (16*31) ; HashKey^32 <<1 mod poly
+%define HashKey_31 (16*32) ; HashKey^31 <<1 mod poly
+%define HashKey_30 (16*33) ; HashKey^30 <<1 mod poly
+%define HashKey_29 (16*34) ; HashKey^29 <<1 mod poly
+%define HashKey_28 (16*35) ; HashKey^28 <<1 mod poly
+%define HashKey_27 (16*36) ; HashKey^27 <<1 mod poly
+%define HashKey_26 (16*37) ; HashKey^26 <<1 mod poly
+%define HashKey_25 (16*38) ; HashKey^25 <<1 mod poly
+%define HashKey_24 (16*39) ; HashKey^24 <<1 mod poly
+%define HashKey_23 (16*40) ; HashKey^23 <<1 mod poly
+%define HashKey_22 (16*41) ; HashKey^22 <<1 mod poly
+%define HashKey_21 (16*42) ; HashKey^21 <<1 mod poly
+%define HashKey_20 (16*43) ; HashKey^20 <<1 mod poly
+%define HashKey_19 (16*44) ; HashKey^19 <<1 mod poly
+%define HashKey_18 (16*45) ; HashKey^18 <<1 mod poly
+%define HashKey_17 (16*46) ; HashKey^17 <<1 mod poly
+%define HashKey_16 (16*47) ; HashKey^16 <<1 mod poly
+%define HashKey_15 (16*48) ; HashKey^15 <<1 mod poly
+%define HashKey_14 (16*49) ; HashKey^14 <<1 mod poly
+%define HashKey_13 (16*50) ; HashKey^13 <<1 mod poly
+%define HashKey_12 (16*51) ; HashKey^12 <<1 mod poly
+%define HashKey_11 (16*52) ; HashKey^11 <<1 mod poly
+%define HashKey_10 (16*53) ; HashKey^10 <<1 mod poly
+%define HashKey_9 (16*54) ; HashKey^9 <<1 mod poly
+%define HashKey_8 (16*55) ; HashKey^8 <<1 mod poly
+%define HashKey_7 (16*56) ; HashKey^7 <<1 mod poly
+%define HashKey_6 (16*57) ; HashKey^6 <<1 mod poly
+%define HashKey_5 (16*58) ; HashKey^5 <<1 mod poly
+%define HashKey_4 (16*59) ; HashKey^4 <<1 mod poly
+%define HashKey_3 (16*60) ; HashKey^3 <<1 mod poly
+%define HashKey_2 (16*61) ; HashKey^2 <<1 mod poly
+%define HashKey_1 (16*62) ; HashKey <<1 mod poly
+%define HashKey (16*62) ; HashKey <<1 mod poly
+%endif ; !GCM_BIG_DATA
+
+%endif ; GCM_KEYS_VAES_AVX512_INCLUDED
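
Both tables above follow one pattern: hash-key powers are stored highest power first, beginning 15 16-byte blocks into the key structure (offset 16*15), so HashKey^n lives at byte offset 16*(15 + MAX_KEYS - n), with MAX_KEYS = 48 by default or 128 under GCM_BIG_DATA. A small self-check of that formula (hypothetical helper, not part of the library):

#include <assert.h>
#include <stddef.h>

static size_t hashkey_offset(unsigned n, unsigned max_keys)
{
	return 16u * (15u + max_keys - n);
}

static void hashkey_offset_selftest(void)
{
	assert(hashkey_offset(48, 48)   == 16 * 15);  /* HashKey_48                */
	assert(hashkey_offset(1, 48)    == 16 * 62);  /* HashKey_1 / HashKey       */
	assert(hashkey_offset(128, 128) == 16 * 15);  /* HashKey_128, GCM_BIG_DATA */
	assert(hashkey_offset(1, 128)   == 16 * 142); /* HashKey_1,   GCM_BIG_DATA */
}
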
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary.asm
new file mode 100644
index 000000000..6f71e43fa
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary.asm
@@ -0,0 +1,184 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+[bits 64]
+
+%include "reg_sizes.asm"
+
+extern aes_gcm_init_128_sse
+extern aes_gcm_init_128_avx_gen4
+extern aes_gcm_init_128_avx_gen2
+
+extern aes_gcm_enc_128_sse
+extern aes_gcm_enc_128_avx_gen4
+extern aes_gcm_enc_128_avx_gen2
+extern aes_gcm_enc_128_update_sse
+extern aes_gcm_enc_128_update_avx_gen4
+extern aes_gcm_enc_128_update_avx_gen2
+extern aes_gcm_enc_128_finalize_sse
+extern aes_gcm_enc_128_finalize_avx_gen4
+extern aes_gcm_enc_128_finalize_avx_gen2
+
+extern aes_gcm_dec_128_sse
+extern aes_gcm_dec_128_avx_gen4
+extern aes_gcm_dec_128_avx_gen2
+extern aes_gcm_dec_128_update_sse
+extern aes_gcm_dec_128_update_avx_gen4
+extern aes_gcm_dec_128_update_avx_gen2
+extern aes_gcm_dec_128_finalize_sse
+extern aes_gcm_dec_128_finalize_avx_gen4
+extern aes_gcm_dec_128_finalize_avx_gen2
+
+extern aes_gcm_precomp_128_sse
+extern aes_gcm_precomp_128_avx_gen4
+extern aes_gcm_precomp_128_avx_gen2
+
+extern aes_gcm_init_256_sse
+extern aes_gcm_init_256_avx_gen4
+extern aes_gcm_init_256_avx_gen2
+
+extern aes_gcm_enc_256_sse
+extern aes_gcm_enc_256_avx_gen4
+extern aes_gcm_enc_256_avx_gen2
+extern aes_gcm_enc_256_update_sse
+extern aes_gcm_enc_256_update_avx_gen4
+extern aes_gcm_enc_256_update_avx_gen2
+extern aes_gcm_enc_256_finalize_sse
+extern aes_gcm_enc_256_finalize_avx_gen4
+extern aes_gcm_enc_256_finalize_avx_gen2
+
+extern aes_gcm_dec_256_sse
+extern aes_gcm_dec_256_avx_gen4
+extern aes_gcm_dec_256_avx_gen2
+extern aes_gcm_dec_256_update_sse
+extern aes_gcm_dec_256_update_avx_gen4
+extern aes_gcm_dec_256_update_avx_gen2
+extern aes_gcm_dec_256_finalize_sse
+extern aes_gcm_dec_256_finalize_avx_gen4
+extern aes_gcm_dec_256_finalize_avx_gen2
+
+extern aes_gcm_precomp_256_sse
+extern aes_gcm_precomp_256_avx_gen4
+extern aes_gcm_precomp_256_avx_gen2
+
+%if (AS_FEATURE_LEVEL) >= 10
+extern aes_gcm_precomp_128_vaes_avx512
+extern aes_gcm_init_128_vaes_avx512
+extern aes_gcm_enc_128_update_vaes_avx512
+extern aes_gcm_dec_128_update_vaes_avx512
+extern aes_gcm_enc_128_finalize_vaes_avx512
+extern aes_gcm_dec_128_finalize_vaes_avx512
+extern aes_gcm_enc_128_vaes_avx512
+extern aes_gcm_dec_128_vaes_avx512
+
+extern aes_gcm_precomp_256_vaes_avx512
+extern aes_gcm_init_256_vaes_avx512
+extern aes_gcm_enc_256_update_vaes_avx512
+extern aes_gcm_dec_256_update_vaes_avx512
+extern aes_gcm_enc_256_finalize_vaes_avx512
+extern aes_gcm_dec_256_finalize_vaes_avx512
+extern aes_gcm_enc_256_vaes_avx512
+extern aes_gcm_dec_256_vaes_avx512
+%endif
+
+section .text
+
+%include "multibinary.asm"
+
+;;;;
+; instantiate aesni_gcm interfaces init, enc, enc_update, enc_finalize, dec, dec_update, dec_finalize and precomp
+;;;;
+mbin_interface aes_gcm_init_128
+mbin_dispatch_init7 aes_gcm_init_128, aes_gcm_init_128_sse, aes_gcm_init_128_sse, aes_gcm_init_128_avx_gen2, aes_gcm_init_128_avx_gen4, aes_gcm_init_128_avx_gen4, aes_gcm_init_128_vaes_avx512
+
+mbin_interface aes_gcm_enc_128
+mbin_dispatch_init7 aes_gcm_enc_128, aes_gcm_enc_128_sse, aes_gcm_enc_128_sse, aes_gcm_enc_128_avx_gen2, aes_gcm_enc_128_avx_gen4, aes_gcm_enc_128_avx_gen4, aes_gcm_enc_128_vaes_avx512
+
+mbin_interface aes_gcm_enc_128_update
+mbin_dispatch_init7 aes_gcm_enc_128_update, aes_gcm_enc_128_update_sse, aes_gcm_enc_128_update_sse, aes_gcm_enc_128_update_avx_gen2, aes_gcm_enc_128_update_avx_gen4, aes_gcm_enc_128_update_avx_gen4, aes_gcm_enc_128_update_vaes_avx512
+
+mbin_interface aes_gcm_enc_128_finalize
+mbin_dispatch_init7 aes_gcm_enc_128_finalize, aes_gcm_enc_128_finalize_sse, aes_gcm_enc_128_finalize_sse, aes_gcm_enc_128_finalize_avx_gen2, aes_gcm_enc_128_finalize_avx_gen4, aes_gcm_enc_128_finalize_avx_gen4, aes_gcm_enc_128_finalize_vaes_avx512
+
+mbin_interface aes_gcm_dec_128
+mbin_dispatch_init7 aes_gcm_dec_128, aes_gcm_dec_128_sse, aes_gcm_dec_128_sse, aes_gcm_dec_128_avx_gen2, aes_gcm_dec_128_avx_gen4, aes_gcm_dec_128_avx_gen4, aes_gcm_dec_128_vaes_avx512
+
+mbin_interface aes_gcm_dec_128_update
+mbin_dispatch_init7 aes_gcm_dec_128_update, aes_gcm_dec_128_update_sse, aes_gcm_dec_128_update_sse, aes_gcm_dec_128_update_avx_gen2, aes_gcm_dec_128_update_avx_gen4, aes_gcm_dec_128_update_avx_gen4, aes_gcm_dec_128_update_vaes_avx512
+
+mbin_interface aes_gcm_dec_128_finalize
+mbin_dispatch_init7 aes_gcm_dec_128_finalize, aes_gcm_dec_128_finalize_sse, aes_gcm_dec_128_finalize_sse, aes_gcm_dec_128_finalize_avx_gen2, aes_gcm_dec_128_finalize_avx_gen4, aes_gcm_dec_128_finalize_avx_gen4, aes_gcm_dec_128_finalize_vaes_avx512
+
+mbin_interface aes_gcm_precomp_128
+mbin_dispatch_init7 aes_gcm_precomp_128, aes_gcm_precomp_128_sse, aes_gcm_precomp_128_sse, aes_gcm_precomp_128_avx_gen2, aes_gcm_precomp_128_avx_gen4, aes_gcm_precomp_128_avx_gen4, aes_gcm_precomp_128_vaes_avx512
+
+;;;;
+; instantiate aesni_gcm interfaces init, enc, enc_update, enc_finalize, dec, dec_update, dec_finalize and precomp
+;;;;
+mbin_interface aes_gcm_init_256
+mbin_dispatch_init7 aes_gcm_init_256, aes_gcm_init_256_sse, aes_gcm_init_256_sse, aes_gcm_init_256_avx_gen2, aes_gcm_init_256_avx_gen4, aes_gcm_init_256_avx_gen4, aes_gcm_init_256_vaes_avx512
+
+mbin_interface aes_gcm_enc_256
+mbin_dispatch_init7 aes_gcm_enc_256, aes_gcm_enc_256_sse, aes_gcm_enc_256_sse, aes_gcm_enc_256_avx_gen2, aes_gcm_enc_256_avx_gen4, aes_gcm_enc_256_avx_gen4, aes_gcm_enc_256_vaes_avx512
+
+mbin_interface aes_gcm_enc_256_update
+mbin_dispatch_init7 aes_gcm_enc_256_update, aes_gcm_enc_256_update_sse, aes_gcm_enc_256_update_sse, aes_gcm_enc_256_update_avx_gen2, aes_gcm_enc_256_update_avx_gen4, aes_gcm_enc_256_update_avx_gen4, aes_gcm_enc_256_update_vaes_avx512
+
+mbin_interface aes_gcm_enc_256_finalize
+mbin_dispatch_init7 aes_gcm_enc_256_finalize, aes_gcm_enc_256_finalize_sse, aes_gcm_enc_256_finalize_sse, aes_gcm_enc_256_finalize_avx_gen2, aes_gcm_enc_256_finalize_avx_gen4, aes_gcm_enc_256_finalize_avx_gen4, aes_gcm_enc_256_finalize_vaes_avx512
+
+mbin_interface aes_gcm_dec_256
+mbin_dispatch_init7 aes_gcm_dec_256, aes_gcm_dec_256_sse, aes_gcm_dec_256_sse, aes_gcm_dec_256_avx_gen2, aes_gcm_dec_256_avx_gen4, aes_gcm_dec_256_avx_gen4, aes_gcm_dec_256_vaes_avx512
+
+mbin_interface aes_gcm_dec_256_update
+mbin_dispatch_init7 aes_gcm_dec_256_update, aes_gcm_dec_256_update_sse, aes_gcm_dec_256_update_sse, aes_gcm_dec_256_update_avx_gen2, aes_gcm_dec_256_update_avx_gen4, aes_gcm_dec_256_update_avx_gen4, aes_gcm_dec_256_update_vaes_avx512
+
+mbin_interface aes_gcm_dec_256_finalize
+mbin_dispatch_init7 aes_gcm_dec_256_finalize, aes_gcm_dec_256_finalize_sse, aes_gcm_dec_256_finalize_sse, aes_gcm_dec_256_finalize_avx_gen2, aes_gcm_dec_256_finalize_avx_gen4, aes_gcm_dec_256_finalize_avx_gen4, aes_gcm_dec_256_finalize_vaes_avx512
+
+mbin_interface aes_gcm_precomp_256
+mbin_dispatch_init7 aes_gcm_precomp_256, aes_gcm_precomp_256_sse, aes_gcm_precomp_256_sse, aes_gcm_precomp_256_avx_gen2, aes_gcm_precomp_256_avx_gen4, aes_gcm_precomp_256_avx_gen4, aes_gcm_precomp_256_vaes_avx512
+
+
+;;; func core, ver, snum
+slversion aes_gcm_enc_128, 00, 00, 02c0
+slversion aes_gcm_dec_128, 00, 00, 02c1
+slversion aes_gcm_init_128, 00, 00, 02c2
+slversion aes_gcm_enc_128_update, 00, 00, 02c3
+slversion aes_gcm_dec_128_update, 00, 00, 02c4
+slversion aes_gcm_enc_128_finalize, 00, 00, 02c5
+slversion aes_gcm_dec_128_finalize, 00, 00, 02c6
+slversion aes_gcm_enc_256, 00, 00, 02d0
+slversion aes_gcm_dec_256, 00, 00, 02d1
+slversion aes_gcm_init_256, 00, 00, 02d2
+slversion aes_gcm_enc_256_update, 00, 00, 02d3
+slversion aes_gcm_dec_256_update, 00, 00, 02d4
+slversion aes_gcm_enc_256_finalize, 00, 00, 02d5
+slversion aes_gcm_dec_256_finalize, 00, 00, 02d6
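
Each mbin_interface/mbin_dispatch_init7 pair above publishes one public aes_gcm_* symbol and binds it, based on detected CPU features, to the SSE, AVX gen2/gen4 or VAES/AVX-512 body. A hedged usage sketch of the dispatched one-shot encrypt, assuming these entry points take the same arguments as the *_nt calls in the random test later in this patch:

#include <stdint.h>
#include <aes_gcm.h>

/* Sketch only: argument order mirrors gcm_nt_rand_test.c; tag_len <= 16. */
static void gcm128_encrypt_once(uint8_t *key, uint8_t *iv,
				uint8_t *aad, uint64_t aad_len,
				uint8_t *pt, uint64_t pt_len,
				uint8_t *ct, uint8_t *tag, uint64_t tag_len)
{
	struct gcm_key_data gkey;	/* expanded AES round keys + GHASH keys */
	struct gcm_context_data gctx;	/* per-message running state            */

	aes_gcm_pre_128(key, &gkey);	/* precompute once per key              */
	aes_gcm_enc_128(&gkey, &gctx, ct, pt, pt_len,
			iv, aad, aad_len, tag, tag_len);
}
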
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary_nt.asm
new file mode 100644
index 000000000..4c5083173
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary_nt.asm
@@ -0,0 +1,118 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+[bits 64]
+
+%include "reg_sizes.asm"
+
+extern aes_gcm_enc_128_sse_nt
+extern aes_gcm_enc_128_avx_gen4_nt
+extern aes_gcm_enc_128_avx_gen2_nt
+extern aes_gcm_enc_128_update_sse_nt
+extern aes_gcm_enc_128_update_avx_gen4_nt
+extern aes_gcm_enc_128_update_avx_gen2_nt
+
+extern aes_gcm_dec_128_sse_nt
+extern aes_gcm_dec_128_avx_gen4_nt
+extern aes_gcm_dec_128_avx_gen2_nt
+extern aes_gcm_dec_128_update_sse_nt
+extern aes_gcm_dec_128_update_avx_gen4_nt
+extern aes_gcm_dec_128_update_avx_gen2_nt
+
+extern aes_gcm_enc_256_sse_nt
+extern aes_gcm_enc_256_avx_gen4_nt
+extern aes_gcm_enc_256_avx_gen2_nt
+extern aes_gcm_enc_256_update_sse_nt
+extern aes_gcm_enc_256_update_avx_gen4_nt
+extern aes_gcm_enc_256_update_avx_gen2_nt
+
+extern aes_gcm_dec_256_sse_nt
+extern aes_gcm_dec_256_avx_gen4_nt
+extern aes_gcm_dec_256_avx_gen2_nt
+extern aes_gcm_dec_256_update_sse_nt
+extern aes_gcm_dec_256_update_avx_gen4_nt
+extern aes_gcm_dec_256_update_avx_gen2_nt
+
+%if (AS_FEATURE_LEVEL) >= 10
+extern aes_gcm_enc_128_update_vaes_avx512_nt
+extern aes_gcm_dec_128_update_vaes_avx512_nt
+extern aes_gcm_enc_128_vaes_avx512_nt
+extern aes_gcm_dec_128_vaes_avx512_nt
+
+extern aes_gcm_enc_256_update_vaes_avx512_nt
+extern aes_gcm_dec_256_update_vaes_avx512_nt
+extern aes_gcm_enc_256_vaes_avx512_nt
+extern aes_gcm_dec_256_vaes_avx512_nt
+%endif
+
+section .text
+
+%include "multibinary.asm"
+
+;;;;
+; instantiate aes_gcm NT interfaces enc, enc_update, dec, dec_update
+;;;;
+mbin_interface aes_gcm_enc_128_nt
+mbin_dispatch_init7 aes_gcm_enc_128_nt, aes_gcm_enc_128_sse_nt, aes_gcm_enc_128_sse_nt, aes_gcm_enc_128_avx_gen2_nt, aes_gcm_enc_128_avx_gen4_nt, aes_gcm_enc_128_avx_gen4_nt, aes_gcm_enc_128_vaes_avx512_nt
+
+mbin_interface aes_gcm_enc_128_update_nt
+mbin_dispatch_init7 aes_gcm_enc_128_update_nt, aes_gcm_enc_128_update_sse_nt, aes_gcm_enc_128_update_sse_nt, aes_gcm_enc_128_update_avx_gen2_nt, aes_gcm_enc_128_update_avx_gen4_nt, aes_gcm_enc_128_update_avx_gen4_nt, aes_gcm_enc_128_update_vaes_avx512_nt
+
+mbin_interface aes_gcm_dec_128_nt
+mbin_dispatch_init7 aes_gcm_dec_128_nt, aes_gcm_dec_128_sse_nt, aes_gcm_dec_128_sse_nt, aes_gcm_dec_128_avx_gen2_nt, aes_gcm_dec_128_avx_gen4_nt, aes_gcm_dec_128_avx_gen4_nt, aes_gcm_dec_128_vaes_avx512_nt
+
+mbin_interface aes_gcm_dec_128_update_nt
+mbin_dispatch_init7 aes_gcm_dec_128_update_nt, aes_gcm_dec_128_update_sse_nt, aes_gcm_dec_128_update_sse_nt, aes_gcm_dec_128_update_avx_gen2_nt, aes_gcm_dec_128_update_avx_gen4_nt, aes_gcm_dec_128_update_avx_gen4_nt, aes_gcm_dec_128_update_vaes_avx512_nt
+
+;;;;
+; instantiate aes_gcm 256-bit NT interfaces enc, enc_update, dec and dec_update
+;;;;
+mbin_interface aes_gcm_enc_256_nt
+mbin_dispatch_init7 aes_gcm_enc_256_nt, aes_gcm_enc_256_sse_nt, aes_gcm_enc_256_sse_nt, aes_gcm_enc_256_avx_gen2_nt, aes_gcm_enc_256_avx_gen4_nt, aes_gcm_enc_256_avx_gen4_nt, aes_gcm_enc_256_vaes_avx512_nt
+
+mbin_interface aes_gcm_enc_256_update_nt
+mbin_dispatch_init7 aes_gcm_enc_256_update_nt, aes_gcm_enc_256_update_sse_nt, aes_gcm_enc_256_update_sse_nt, aes_gcm_enc_256_update_avx_gen2_nt, aes_gcm_enc_256_update_avx_gen4_nt, aes_gcm_enc_256_update_avx_gen4_nt, aes_gcm_enc_256_update_vaes_avx512_nt
+
+mbin_interface aes_gcm_dec_256_nt
+mbin_dispatch_init7 aes_gcm_dec_256_nt, aes_gcm_dec_256_sse_nt, aes_gcm_dec_256_sse_nt, aes_gcm_dec_256_avx_gen2_nt, aes_gcm_dec_256_avx_gen4_nt, aes_gcm_dec_256_avx_gen4_nt, aes_gcm_dec_256_vaes_avx512_nt
+
+mbin_interface aes_gcm_dec_256_update_nt
+mbin_dispatch_init7 aes_gcm_dec_256_update_nt, aes_gcm_dec_256_update_sse_nt, aes_gcm_dec_256_update_sse_nt, aes_gcm_dec_256_update_avx_gen2_nt, aes_gcm_dec_256_update_avx_gen4_nt, aes_gcm_dec_256_update_avx_gen4_nt, aes_gcm_dec_256_update_vaes_avx512_nt
+
+
+;;; func core, ver, snum
+slversion aes_gcm_enc_128_nt, 00, 00, 02e1
+slversion aes_gcm_dec_128_nt, 00, 00, 02e2
+slversion aes_gcm_enc_128_update_nt, 00, 00, 02e3
+slversion aes_gcm_dec_128_update_nt, 00, 00, 02e4
+slversion aes_gcm_enc_256_nt, 00, 00, 02e5
+slversion aes_gcm_dec_256_nt, 00, 00, 02e6
+slversion aes_gcm_enc_256_update_nt, 00, 00, 02e7
+slversion aes_gcm_dec_256_update_nt, 00, 00, 02e8
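
The *_nt variants bound above use non-temporal loads/stores on the data buffers, which is why the test that follows allocates plaintext and ciphertext with posix_memalign to 64 bytes. A minimal allocation sketch under that assumption (helper name is illustrative):

#include <stdint.h>
#include <stdlib.h>

/* 64-byte-aligned in/out buffers for the aes_gcm_*_nt entry points. */
static int alloc_nt_buffers(uint64_t len, uint8_t **pt, uint8_t **ct)
{
	if (posix_memalign((void **)pt, 64, len) != 0)
		return -1;
	if (posix_memalign((void **)ct, 64, len) != 0) {
		free(*pt);
		return -1;
	}
	return 0;
}
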
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_rand_test.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_rand_test.c
new file mode 100644
index 000000000..529d36b31
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_rand_test.c
@@ -0,0 +1,2038 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h> // for memcmp
+#include <aes_gcm.h>
+#include <openssl/sha.h>
+#include "gcm_vectors.h"
+#include "ossl_helper.h"
+#include "types.h"
+
+//#define GCM_VECTORS_VERBOSE
+//#define GCM_VECTORS_EXTRA_VERBOSE
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#ifndef RANDOMS
+# define RANDOMS 200
+#endif
+#ifndef TEST_LEN
+# define TEST_LEN 32*1024
+#endif
+#ifndef PAGE_LEN
+# define PAGE_LEN (4*1024)
+#endif
+
+// NT versions require 64B alignment
+# define NT_ALIGNMENT (64)
+# define ALIGNMENT_MASK (~(NT_ALIGNMENT - 1))
+# define OFFSET_BASE_VALUE (NT_ALIGNMENT)
+#ifndef MAX_UNALIGNED
+# define MAX_UNALIGNED (1)
+#endif
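/*
 * With NT_ALIGNMENT = 64, ALIGNMENT_MASK is ~0x3F, so "x & ALIGNMENT_MASK"
 * rounds x down to a multiple of 64 (for example 100 & ~0x3F == 64).  The
 * streaming tests below apply this mask to their random break points so that
 * every partial update handed to the _nt functions stays 64-byte aligned.
 */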
+
+void dump_table(char *title, uint8_t * table, uint8_t count)
+{
+ int i;
+ char const *space = " ";
+
+ printf("%s%s => {\n", space, title);
+ for (i = 0; i < count; i++) {
+ if (0 == (i & 15))
+ printf("%s%s", space, space);
+ printf("%2x, ", table[i]);
+ if (15 == (i & 15))
+ printf("\n");
+
+ }
+ printf("%s}\n", space);
+}
+
+void dump_gcm_data(struct gcm_key_data *gkey)
+{
+#ifdef GCM_VECTORS_EXTRA_VERBOSE
+ printf("gcm_data {\n");
+ dump_table("expanded_keys", gkey->expanded_keys, (16 * 11));
+ dump_table("shifted_hkey_1", gkey->shifted_hkey_1, 16);
+ dump_table("shifted_hkey_2", gkey->shifted_hkey_2, 16);
+ dump_table("shifted_hkey_3", gkey->shifted_hkey_3, 16);
+ dump_table("shifted_hkey_4", gkey->shifted_hkey_4, 16);
+ dump_table("shifted_hkey_5", gkey->shifted_hkey_5, 16);
+ dump_table("shifted_hkey_6", gkey->shifted_hkey_6, 16);
+ dump_table("shifted_hkey_7", gkey->shifted_hkey_7, 16);
+ dump_table("shifted_hkey_8", gkey->shifted_hkey_8, 16);
+ dump_table("shifted_hkey_1_k", gkey->shifted_hkey_1_k, 16);
+ dump_table("shifted_hkey_2_k", gkey->shifted_hkey_2_k, 16);
+ dump_table("shifted_hkey_3_k", gkey->shifted_hkey_3_k, 16);
+ dump_table("shifted_hkey_4_k", gkey->shifted_hkey_4_k, 16);
+ dump_table("shifted_hkey_5_k", gkey->shifted_hkey_5_k, 16);
+ dump_table("shifted_hkey_6_k", gkey->shifted_hkey_6_k, 16);
+ dump_table("shifted_hkey_7_k", gkey->shifted_hkey_7_k, 16);
+ dump_table("shifted_hkey_8_k", gkey->shifted_hkey_8_k, 16);
+ printf("}\n");
+#endif //GCM_VECTORS_EXTRA_VERBOSE
+}
+
+void mk_rand_data(uint8_t * data, uint32_t size)
+{
+ int i;
+ for (i = 0; i < size; i++) {
+ *data++ = rand();
+ }
+}
+
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name)
+{
+ int mismatch;
+ int OK = 0;
+
+ mismatch = memcmp(test, expected, len);
+ if (mismatch) {
+ OK = 1;
+ printf(" expected results don't match %s \t\t", data_name);
+ {
+ uint64_t a;
+ for (a = 0; a < len; a++) {
+ if (test[a] != expected[a]) {
+ printf(" '%x' != '%x' at %lx of %lx\n",
+ test[a], expected[a], a, len);
+ break;
+ }
+ }
+ }
+ }
+ return OK;
+}
+
+int check_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx, gcm_vector * vector)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ int ret;
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+ // Allocate space for the calculated ciphertext
+ if (vector->Plen != 0) {
+ ret = posix_memalign((void **)&pt_test, 64, vector->Plen);
+ ret |= posix_memalign((void **)&ct_test, 64, vector->Plen);
+ ret |= posix_memalign((void **)&o_ct_test, 64, vector->Plen);
+ if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL)
+ || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+		fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_enc_128_nt(gkey, gctx, vector->C, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aes_gcm_dec_128_nt(gkey, gctx, vector->P, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_128_nt(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ if (vector->Plen != 0) {
+ aligned_free(pt_test);
+ aligned_free(ct_test);
+ aligned_free(o_ct_test);
+ }
+
+ return OK;
+}
+
+int check_strm_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx,
+ gcm_vector * vector, int test_len)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint8_t *stream = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ uint32_t last_break;
+ int i, ret;
+ uint8_t *rand_data = NULL;
+ uint64_t length;
+
+ rand_data = malloc(100);
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+ // Allocate space for the calculated ciphertext
+ if (vector->Plen != 0) {
+ ret = posix_memalign((void **)&pt_test, 64, vector->Plen);
+ ret |= posix_memalign((void **)&ct_test, 64, vector->Plen);
+ ret |= posix_memalign((void **)&o_ct_test, 64, vector->Plen);
+ if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL)
+ || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+	// Allocate space for the IV
+	IV_c = malloc(IV_alloc_len);
+	if (IV_c == NULL) {
+		fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+
+ last_break = 0;
+ i = (rand() % test_len / 8) & ALIGNMENT_MASK;
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ ret = posix_memalign((void **)&stream, 64, (i - last_break));
+ if ((ret != 0) || (stream == NULL)) {
+ OK = 1;
+ fprintf(stderr, "posix_memalign failed\n");
+ break;
+ }
+ memcpy(stream, vector->P + last_break, i - last_break);
+ }
+ aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ aligned_free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+ last_break = i;
+ i = (rand() % test_len / 8) & ALIGNMENT_MASK;
+
+ }
+ aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break, vector->P + last_break,
+ vector->Plen - last_break);
+ if (gctx->in_length != vector->Plen)
+ printf("%lu, %lu\n", gctx->in_length, vector->Plen);
+ aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+
+ last_break = 0;
+ i = 0;
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < (vector->Plen)) {
+ if (rand() % (test_len / 64) == 0) {
+ if (i - last_break != 0) {
+ ret = posix_memalign((void **)&stream, 64, i - last_break);
+ if ((ret != 0) || (stream == NULL)) {
+ OK = 1;
+ fprintf(stderr, "posix_memalign failed\n");
+ break;
+ }
+ memcpy(stream, vector->C + last_break, i - last_break);
+ }
+ aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ aligned_free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+
+ last_break = i;
+
+ }
+ if (rand() % 1024 != 0)
+ i++;
+
+ }
+ aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break, vector->C + last_break,
+ vector->Plen - last_break);
+ aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_128_nt(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ if (vector->Plen != 0) {
+ aligned_free(pt_test);
+ aligned_free(ct_test);
+ aligned_free(o_ct_test);
+ }
+ free(rand_data);
+
+ return OK;
+}
+
+int check_strm_vector2(struct gcm_key_data *gkey, struct gcm_context_data *gctx,
+ gcm_vector * vector, int length, int start, int breaks)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint8_t *stream = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ uint32_t last_break = 0;
+ int i = length;
+ uint8_t *rand_data = NULL;
+ int ret;
+
+ rand_data = malloc(100);
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+ // Allocate space for the calculated ciphertext
+ if (vector->Plen != 0) {
+ pt_test = malloc(vector->Plen);
+ ct_test = malloc(vector->Plen);
+ ret = posix_memalign((void **)&o_ct_test, 64, vector->Plen);
+ if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL)
+ || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+	// Allocate space for the IV
+	IV_c = malloc(IV_alloc_len);
+	if (IV_c == NULL) {
+		fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_enc_128_nt(gkey, gctx, vector->C, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ ret = posix_memalign((void **)&stream, 64, i - last_break);
+ if ((ret != 0) || (stream == NULL)) {
+ OK = 1;
+ fprintf(stderr, "posix_memalign failed\n");
+ break;
+ }
+ memcpy(stream, vector->P + last_break, i - last_break);
+ }
+ aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ aligned_free(stream);
+ last_break = i;
+ i = i + (length - start) / breaks;
+
+ }
+ aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break, vector->P + last_break,
+ vector->Plen - last_break);
+ aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+
+ last_break = 0;
+ i = length;
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ ret = posix_memalign((void **)&stream, 64, i - last_break);
+ if ((ret != 0) || (stream == NULL)) {
+ OK = 1;
+ fprintf(stderr, "posix_memalign failed\n");
+ break;
+ }
+ memcpy(stream, vector->C + last_break, i - last_break);
+ }
+ aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ aligned_free(stream);
+ last_break = i;
+ i = i + (length - start) / breaks;
+
+ }
+
+ aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break, vector->C + last_break,
+ vector->Plen - last_break);
+ aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_128_nt(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(rand_data);
+ if (vector->Plen != 0) {
+ free(pt_test);
+ free(ct_test);
+ aligned_free(o_ct_test);
+ }
+
+ return OK;
+}
+
+int check_strm_vector_efence(struct gcm_key_data *gkey, struct gcm_context_data *gctx,
+ gcm_vector * vector)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint8_t *stream = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ uint32_t last_break = 0;
+ int i = 1;
+ uint8_t *rand_data = NULL;
+ uint64_t length;
+ int ret;
+
+ rand_data = malloc(100);
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+ // Allocate space for the calculated ciphertext
+ if (vector->Plen != 0) {
+ ret = posix_memalign((void **)&pt_test, 64, vector->Plen);
+ ret |= posix_memalign((void **)&ct_test, 64, vector->Plen);
+ ret |= posix_memalign((void **)&o_ct_test, 64, vector->Plen);
+ if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL)
+ || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+		fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < vector->Plen) {
+ if (rand() % 2000 == 0 || i - last_break > PAGE_LEN / 2) {
+ ret = posix_memalign((void **)&stream, 64, PAGE_LEN);
+ if ((ret != 0) || (stream == NULL)) {
+ OK = 1;
+ fprintf(stderr, "posix_memalign failed\n");
+ break;
+ }
+ i = i & ALIGNMENT_MASK;
+ memcpy(stream + PAGE_LEN - (i - last_break), vector->P + last_break,
+ i - last_break);
+ aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break,
+ stream + PAGE_LEN - (i - last_break),
+ i - last_break);
+ aligned_free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+ last_break = i;
+ }
+ if (rand() % 1024 != 0)
+ i++;
+
+ }
+ aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break, vector->P + last_break,
+ vector->Plen - last_break);
+ aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+
+ last_break = 0;
+ i = 0;
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < vector->Plen) {
+ if (rand() % 2000 == 0 || i - last_break > PAGE_LEN / 2) {
+ ret = posix_memalign((void **)&stream, 64, PAGE_LEN);
+ if ((ret != 0) || (stream == NULL)) {
+ OK = 1;
+ fprintf(stderr, "posix_memalign failed\n");
+ break;
+ }
+ i = i & ALIGNMENT_MASK;
+ memcpy(stream + PAGE_LEN - (i - last_break), vector->C + last_break,
+ i - last_break);
+ aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break,
+ stream + PAGE_LEN - (i - last_break),
+ i - last_break);
+ aligned_free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+
+ last_break = i;
+
+ }
+ if (rand() % 1024 != 0)
+ i++;
+
+ }
+ aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break, vector->C + last_break,
+ vector->Plen - last_break);
+ aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_128_nt(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ if (vector->Plen != 0) {
+ aligned_free(pt_test);
+ aligned_free(ct_test);
+ aligned_free(o_ct_test);
+ }
+ free(rand_data);
+
+ return OK;
+}
+
+int check_256_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx,
+ gcm_vector * vector)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ int ret;
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+ // Allocate space for the calculated ciphertext
+ if (vector->Plen != 0) {
+ ret = posix_memalign((void **)&pt_test, 64, vector->Plen);
+ ret |= posix_memalign((void **)&ct_test, 64, vector->Plen);
+ ret |= posix_memalign((void **)&o_ct_test, 64, vector->Plen);
+ if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL)
+ || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+	// Allocate space for the IV
+	IV_c = malloc(IV_alloc_len);
+	if (IV_c == NULL) {
+		fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_256(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_enc_256_nt(gkey, gctx, vector->C, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ openssl_aes_256_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aes_gcm_dec_256_nt(gkey, gctx, vector->P, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(vector->T, T_test, vector->Tlen, "ISA-L decrypt vs encrypt tag (T)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L decrypted ISA-L plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_256_nt(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L decrypted OpenSSL plain text (P)");
+ result =
+ openssl_aes_256_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ if (vector->Plen != 0) {
+ aligned_free(pt_test);
+ aligned_free(ct_test);
+ aligned_free(o_ct_test);
+ }
+
+ return OK;
+}
+
+int check_256_strm_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx,
+ gcm_vector * vector, int test_len)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint8_t *stream = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ uint32_t last_break;
+ int i, ret;
+ uint8_t *rand_data = NULL;
+ uint64_t length;
+
+ rand_data = malloc(100);
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+ // Allocate space for the calculated ciphertext
+ if (vector->Plen != 0) {
+ ret = posix_memalign((void **)&pt_test, 64, vector->Plen);
+ ret |= posix_memalign((void **)&ct_test, 64, vector->Plen);
+ ret |= posix_memalign((void **)&o_ct_test, 64, vector->Plen);
+ if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL)
+ || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+ // Allocate space for a copy of the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_256(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_init_256(gkey, gctx, IV_c, vector->A, vector->Alen);
+
+ last_break = 0;
+ i = (rand() % test_len / 8) & ALIGNMENT_MASK;
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ ret = posix_memalign((void **)&stream, 64, i - last_break);
+ if ((ret != 0) || (stream == NULL)) {
+ OK = 1;
+ fprintf(stderr, "posix_memalign failed\n");
+ break;
+ }
+ memcpy(stream, vector->P + last_break, i - last_break);
+ }
+
+ aes_gcm_enc_256_update_nt(gkey, gctx, vector->C + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ aligned_free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+ last_break = i;
+ i += (rand() % test_len / 8) & ALIGNMENT_MASK;
+
+ }
+ aes_gcm_enc_256_update_nt(gkey, gctx, vector->C + last_break, vector->P + last_break,
+ vector->Plen - last_break);
+ if (gctx->in_length != vector->Plen)
+ printf("%lu, %lu\n", gctx->in_length, vector->Plen);
+ aes_gcm_enc_256_finalize(gkey, gctx, vector->T, vector->Tlen);
+
+ openssl_aes_256_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+
+ last_break = 0;
+ i = (rand() % test_len / 8) & ALIGNMENT_MASK;
+ aes_gcm_init_256(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ ret = posix_memalign((void **)&stream, 64, i - last_break);
+ if ((ret != 0) || (stream == NULL)) {
+ OK = 1;
+ fprintf(stderr, "posix_memalign failed\n");
+ break;
+ }
+ memcpy(stream, vector->C + last_break, i - last_break);
+ }
+
+ aes_gcm_dec_256_update_nt(gkey, gctx, vector->P + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ aligned_free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+
+ last_break = i;
+ i += (rand() % test_len / 8) & ALIGNMENT_MASK;
+
+ }
+ aes_gcm_dec_256_update_nt(gkey, gctx, vector->P + last_break, vector->C + last_break,
+ vector->Plen - last_break);
+ aes_gcm_dec_256_finalize(gkey, gctx, vector->T, vector->Tlen);
+
+ OK |= check_data(vector->T, T_test, vector->Tlen, "ISA-L decrypt vs encrypt tag (T)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L decrypted ISA-L plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_256_nt(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L decrypted OpenSSL plain text (P)");
+ result =
+ openssl_aes_256_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(rand_data);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ if (vector->Plen != 0) {
+ aligned_free(pt_test);
+ aligned_free(ct_test);
+ aligned_free(o_ct_test);
+ }
+
+ return OK;
+}
+
+int test_gcm_strm_efence(void)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+ int ret;
+
+ gkey = malloc(sizeof(struct gcm_key_data));
+ gctx = malloc(sizeof(struct gcm_context_data));
+ if (NULL == gkey || NULL == gctx)
+ return 1;
+
+ printf("AES GCM random efence test vectors with random stream:");
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = (rand() % TEST_LEN);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % TEST_LEN);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ ret = posix_memalign((void **)&test.P, 64, test.Plen + offset);
+ ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ ret = posix_memalign((void **)&test.P, 64, 16);
+ ret |= posix_memalign((void **)&test.C, 64, 16);
+ }
+ if (ret != 0) {
+ printf("posix_memalign for testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+ test.K = malloc(GCM_128_KEY_LEN + offset);
+ test.Klen = GCM_128_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_strm_vector_efence(gkey, gctx, &test))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ aligned_free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ aligned_free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkey);
+ free(gctx);
+ return 0;
+}
+
+int test_gcm_strm_combinations(int test_len)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ uint8_t *gkeytemp = NULL;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+ int ret;
+
+ gkeytemp = malloc(sizeof(struct gcm_key_data) + 64);
+ gctx = malloc(sizeof(struct gcm_context_data));
+ if (NULL == gkeytemp || NULL == gctx)
+ return 1;
+ gkey = (struct gcm_key_data *)(gkeytemp + rand() % 64);
+
+ printf("AES GCM random test vectors with random stream of average size %d:",
+ test_len / 64);
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = (rand() % test_len);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % test_len);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ ret = posix_memalign((void **)&test.P, 64, test.Plen + offset);
+ ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ ret = posix_memalign((void **)&test.P, 64, 16);
+ ret |= posix_memalign((void **)&test.C, 64, 16);
+ }
+ if (ret != 0) {
+ printf("posix_memalign for testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+ test.K = malloc(GCM_128_KEY_LEN + offset);
+ test.Klen = GCM_128_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_strm_vector(gkey, gctx, &test, test_len))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ aligned_free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ aligned_free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkeytemp);
+ free(gctx);
+ return 0;
+}
+
+int test_gcm_combinations(void)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+ int ret;
+
+ gkey = malloc(sizeof(struct gcm_key_data));
+ gctx = malloc(sizeof(struct gcm_context_data));
+ if (NULL == gkey || NULL == gctx)
+ return 1;
+
+ printf("AES GCM random test vectors:");
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = (rand() % TEST_LEN);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % TEST_LEN);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ ret = posix_memalign((void **)&test.P, 64, test.Plen + offset);
+ ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ ret = posix_memalign((void **)&test.P, 64, 16);
+ ret |= posix_memalign((void **)&test.C, 64, 16);
+ }
+ if (ret != 0) {
+ printf("posix_memalign for testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+ test.K = malloc(GCM_128_KEY_LEN + offset);
+ test.Klen = GCM_128_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_vector(gkey, gctx, &test))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ aligned_free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ aligned_free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkey);
+ free(gctx);
+ return 0;
+}
+
+int test_gcm256_combinations(void)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+ int ret;
+
+ gkey = malloc(sizeof(struct gcm_key_data));
+ gctx = malloc(sizeof(struct gcm_context_data));
+ if (NULL == gkey || NULL == gctx)
+ return 1;
+
+ printf("AES-GCM-256 random test vectors:");
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = (rand() % TEST_LEN);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % TEST_LEN);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ ret = posix_memalign((void **)&test.P, 64, test.Plen + offset);
+ ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ ret = posix_memalign((void **)&test.P, 64, 16);
+ ret |= posix_memalign((void **)&test.C, 64, 16);
+ }
+ if (ret != 0) {
+ printf("posix_memalign for testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+ test.K = malloc(GCM_256_KEY_LEN + offset);
+ test.Klen = GCM_256_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_256_vector(gkey, gctx, &test))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ aligned_free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ aligned_free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkey);
+ free(gctx);
+ return 0;
+}
+
+int test_gcm256_strm_combinations(int test_len)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ uint8_t *gkeytemp = NULL;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+ int ret;
+
+ gkeytemp = malloc(sizeof(struct gcm_key_data) + 64);
+ gctx = malloc(sizeof(struct gcm_context_data));
+ if (NULL == gkeytemp || NULL == gctx)
+ return 1;
+ gkey = (struct gcm_key_data *)(gkeytemp + rand() % 64);
+
+ printf("AES-GCM-256 random test vectors with random stream of average size %d:",
+ test_len / 64);
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = (rand() % test_len);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % test_len);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ ret = posix_memalign((void **)&test.P, 64, test.Plen + offset);
+ ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ ret = posix_memalign((void **)&test.P, 64, 16);
+ ret |= posix_memalign((void **)&test.C, 64, 16);
+ }
+ if (ret != 0) {
+ printf("posix_memalign for testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+ test.K = malloc(GCM_256_KEY_LEN + offset);
+ test.Klen = GCM_256_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_256_strm_vector(gkey, gctx, &test, test_len))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ aligned_free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ aligned_free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkeytemp);
+ free(gctx);
+ return 0;
+}
+
+//
+// Place all data so that it ends exactly at a page boundary, to catch reads
+// past the end of each buffer.
+//
+int test_gcm_efence(void)
+{
+ gcm_vector test;
+ int offset = 0;
+ gcm_key_size key_len;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+ uint8_t *P = NULL, *C = NULL, *K, *IV, *A, *T;
+ int ret;
+
+ gkey = malloc(sizeof(struct gcm_key_data));
+ gctx = malloc(sizeof(struct gcm_context_data));
+ ret = posix_memalign((void **)&P, 64, PAGE_LEN);
+ ret |= posix_memalign((void **)&C, 64, PAGE_LEN);
+ K = malloc(PAGE_LEN);
+ IV = malloc(PAGE_LEN);
+ A = malloc(PAGE_LEN);
+ T = malloc(PAGE_LEN);
+ if ((0 != ret) || (NULL == P) || (NULL == C) || (NULL == K) || (NULL == IV)
+ || (NULL == A) || (NULL == T) || (NULL == gkey) || (NULL == gctx)) {
+ printf("malloc of testsize:0x%x failed\n", PAGE_LEN);
+ return -1;
+ }
+
+ test.Plen = PAGE_LEN / 2;
+ // place buffers to end at page boundary
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.Alen = test.Plen;
+ test.Tlen = MAX_TAG_LEN;
+
+ printf("AES GCM efence test vectors:");
+ for (key_len = GCM_128_KEY_LEN; GCM_256_KEY_LEN >= key_len;
+ key_len += (GCM_256_KEY_LEN - GCM_128_KEY_LEN)) {
+ test.Klen = key_len;
+ for (offset = 0; MAX_UNALIGNED > offset; offset++) {
+ if (0 == (offset % 80))
+ printf("\n");
+ // move the start and size of the data block towards the end of the page
+ test.Plen = (PAGE_LEN / 2) - offset;
+ test.Alen = (PAGE_LEN / 4) - (offset * 4); //lengths must be a multiple of 4 bytes
+ //Place data at end of page
+ test.P = P + PAGE_LEN - test.Plen;
+ test.C = C + PAGE_LEN - test.Plen;
+ test.K = K + PAGE_LEN - test.Klen;
+ test.IV = IV + PAGE_LEN - test.IVlen;
+ test.A = A + PAGE_LEN - test.Alen;
+ test.T = T + PAGE_LEN - test.Tlen;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+ if (GCM_128_KEY_LEN == key_len) {
+ if (0 != check_vector(gkey, gctx, &test))
+ return 1;
+ } else {
+ if (0 != check_256_vector(gkey, gctx, &test))
+ return 1;
+ }
+ }
+ }
+ free(gkey);
+ free(gctx);
+ aligned_free(P);
+ aligned_free(C);
+ free(K);
+ free(IV);
+ free(A);
+ free(T);
+
+ printf("\n");
+ return 0;
+}
+
+int test_gcm128_std_vectors(gcm_vector const *vector)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int ret;
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("AES-GCM-128:\n");
+#endif
+
+ // Allocate space for the calculated ciphertext
+ ret = posix_memalign((void **)&ct_test, 64, vector->Plen);
+ // Allocate space for the calculated plaintext
+ ret |= posix_memalign((void **)&pt_test, 64, vector->Plen);
+ if ((ret != 0) || (ct_test == NULL) || (pt_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen;
+
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, &gkey);
+#ifdef GCM_VECTORS_VERBOSE
+ dump_gcm_data(&gkey);
+#endif
+
+ ////
+ // ISA-l Encrypt
+ ////
+ memset(ct_test, 0, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_128_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, pt_test, vector->Tlen,
+ vector->P, vector->Plen, ct_test);
+ OK |= check_data(pt_test, T_test, vector->Tlen, "OpenSSL vs ISA-L tag (T)");
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_128_nt(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_128_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+ // test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_128_nt(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_128_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_128_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+ // OpenSSl enc -> ISA-L dec
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, T_test, vector->Tlen,
+ vector->P, vector->Plen, ct_test);
+ OK |=
+ check_data(ct_test, vector->C, vector->Plen, "OpenSSL encrypted cypher text (C)");
+
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_128_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "OpenSSL->ISA-L decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "OpenSSL->ISA-L decrypted tag (T)");
+ // ISA-L enc -> OpenSSl dec
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_128_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, T_test, vector->Tlen,
+ ct_test, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "OSSL decrypted plain text (P)");
+ if (NULL != ct_test)
+ aligned_free(ct_test);
+ if (NULL != pt_test)
+ aligned_free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
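+
+/*
+ * Illustrative sketch (not part of the original test suite): typical
+ * application usage of the one-shot API exercised above.  The key schedule
+ * from aes_gcm_pre_128() is computed once and reused for every message under
+ * the same key, and the tag produced by decryption must be compared against
+ * the tag that accompanied the ciphertext before the plaintext is trusted.
+ * The function name and buffer parameters below are hypothetical.
+ */
+int gcm128_seal_open_sketch(const uint8_t * key, uint8_t * iv,
+ uint8_t * aad, uint64_t aad_len,
+ uint8_t * msg, uint8_t * ct, uint8_t * pt, uint64_t len)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ uint8_t tag_enc[MAX_TAG_LEN], tag_dec[MAX_TAG_LEN];
+
+ // One-time key expansion and GHASH table precomputation for this key
+ aes_gcm_pre_128(key, &gkey);
+
+ // Encrypt and produce the authentication tag
+ aes_gcm_enc_128(&gkey, &gctx, ct, msg, len, iv, aad, aad_len,
+ tag_enc, MAX_TAG_LEN);
+
+ // Decrypt; the tag is an output here and still has to be checked
+ aes_gcm_dec_128(&gkey, &gctx, pt, ct, len, iv, aad, aad_len,
+ tag_dec, MAX_TAG_LEN);
+
+ // Reject the plaintext unless the tags match (production code would use a
+ // constant-time comparison)
+ return memcmp(tag_enc, tag_dec, MAX_TAG_LEN) ? -1 : 0;
+}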
+
+int test_gcm256_std_vectors(gcm_vector const *vector)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int ret;
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("AES-GCM-256:\n");
+#endif
+
+ // Allocate space for the calculated ciphertext
+ ret = posix_memalign((void **)&ct_test, 64, vector->Plen);
+ // Allocate space for the calculated plaintext
+ ret |= posix_memalign((void **)&pt_test, 64, vector->Plen);
+ if ((ret != 0) || (ct_test == NULL) || (pt_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen;
+
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_256(vector->K, &gkey);
+#ifdef GCM_VECTORS_VERBOSE
+ dump_gcm_data(&gkey);
+#endif
+
+ ////
+ // ISA-l Encrypt
+ ////
+ memset(ct_test, 0, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_256_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ openssl_aes_256_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, pt_test, vector->Tlen,
+ vector->P, vector->Plen, ct_test);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "OpenSSL vs KA - cypher text (C)");
+ OK |= check_data(pt_test, vector->T, vector->Tlen, "OpenSSL vs KA - tag (T)");
+ OK |= check_data(pt_test, T_test, vector->Tlen, "OpenSSL vs ISA-L - tag (T)");
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_256_nt(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_256_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+ // test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_256_nt(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_256_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_256_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+ // OpenSSl enc -> ISA-L dec
+ openssl_aes_256_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, T_test, vector->Tlen,
+ vector->P, vector->Plen, ct_test);
+ OK |=
+ check_data(ct_test, vector->C, vector->Plen, "OpenSSL encrypted cypher text (C)");
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_256_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "OpenSSL->ISA-L decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "OpenSSL->ISA-L decrypted tag (T)");
+ // ISA-L enc -> OpenSSl dec
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_256_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ result =
+ openssl_aes_256_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, T_test, vector->Tlen,
+ ct_test, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "OSSL decrypted plain text (P)");
+ if (NULL != ct_test)
+ aligned_free(ct_test);
+ if (NULL != pt_test)
+ aligned_free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
+
+int test_gcm_std_vectors(void)
+{
+ int const vectors_cnt = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]);
+ int vect;
+ int OK = 0;
+
+ printf("AES-GCM standard test vectors:\n");
+ for (vect = 0; vect < vectors_cnt; vect++) {
+#ifdef GCM_VECTORS_VERBOSE
+ printf
+ ("Standard vector %d/%d Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ vect, vectors_cnt - 1, (int)gcm_vectors[vect].Klen,
+ (int)gcm_vectors[vect].IVlen, (int)gcm_vectors[vect].Plen,
+ (int)gcm_vectors[vect].Alen, (int)gcm_vectors[vect].Tlen);
+#else
+ printf(".");
+#endif
+
+ if (BITS_128 == gcm_vectors[vect].Klen) {
+ OK |= test_gcm128_std_vectors(&gcm_vectors[vect]);
+ } else {
+ OK |= test_gcm256_std_vectors(&gcm_vectors[vect]);
+ }
+ if (0 != OK)
+ return OK;
+ }
+ printf("\n");
+ return OK;
+}
+
+// The data length is set to length. The first stream update covers bytes 0 through start;
+// the remaining data is then split into breaks chunks of equal size (except possibly the
+// last one, due to divisibility). A small illustration of this chunking follows the
+// function below.
+int test_gcm_strm_combinations2(int length, int start, int breaks)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+ int ret;
+
+ gkey = malloc(sizeof(struct gcm_key_data));
+ gctx = malloc(sizeof(struct gcm_context_data));
+ if (NULL == gkey || NULL == gctx)
+ return 1;
+
+ printf("AES GCM random test vectors of length %d and stream with %d breaks:", length,
+ breaks + 1);
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = length;
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % TEST_LEN);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ ret = posix_memalign((void **)&test.P, 64, test.Plen + offset);
+ ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ ret = posix_memalign((void **)&test.P, 64, 16);
+ ret |= posix_memalign((void **)&test.C, 64, 16);
+ }
+ if (ret != 0) {
+ printf("posix_memalign for testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+ test.K = malloc(GCM_128_KEY_LEN + offset);
+ test.Klen = GCM_128_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_strm_vector2(gkey, gctx, &test, length, start, breaks))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ aligned_free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ aligned_free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkey);
+ free(gctx);
+ return 0;
+}
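+
+/*
+ * Illustrative sketch (not part of the original test): the chunking scheme
+ * described in the comment above test_gcm_strm_combinations2().  The helper
+ * name below is hypothetical; it only prints the update boundaries that a
+ * streaming check such as check_strm_vector2() (defined elsewhere in this
+ * file) would be expected to use: one update of start bytes, then breaks
+ * roughly equal updates covering the remainder.
+ */
+void show_strm_breaks(int length, int start, int breaks)
+{
+ int pos = start;
+ int chunk = (breaks > 0) ? (length - start) / breaks : 0;
+ int i;
+
+ printf("update: [0, %d)\n", start);
+ for (i = 0; i < breaks; i++) {
+ // the last chunk absorbs any remainder left over by the division
+ int end = (i == breaks - 1) ? length : pos + chunk;
+ printf("update: [%d, %d)\n", pos, end);
+ pos = end;
+ }
+}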
+
+int main(int argc, char **argv)
+{
+ int errors = 0;
+ int seed;
+
+ if (argc == 1)
+ seed = TEST_SEED;
+ else
+ seed = atoi(argv[1]);
+
+ srand(seed);
+ printf("SEED: %d\n", seed);
+
+ errors += test_gcm_std_vectors();
+ errors += test_gcm256_combinations();
+ errors += test_gcm_combinations();
+ errors += test_gcm_efence();
+ errors += test_gcm256_strm_combinations(TEST_LEN);
+ errors += test_gcm_strm_combinations(TEST_LEN);
+ errors += test_gcm256_strm_combinations(1024);
+ errors += test_gcm_strm_combinations(1024);
+ errors += test_gcm_strm_efence();
+ errors += test_gcm_strm_combinations2(1024, 0, 1024);
+
+ if (0 == errors)
+ printf("...Pass\n");
+ else
+ printf("...Fail\n");
+
+ return errors;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_std_vectors_test.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_std_vectors_test.c
new file mode 100644
index 000000000..19c0cc447
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_std_vectors_test.c
@@ -0,0 +1,322 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h> // for memcmp
+#include <aes_gcm.h>
+#include "gcm_vectors.h"
+#include "types.h"
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name)
+{
+ int mismatch;
+ int OK = 0;
+
+ mismatch = memcmp(test, expected, len);
+ if (mismatch) {
+ OK = 1;
+ printf(" expected results don't match %s \t\t", data_name);
+ {
+ uint64_t a;
+ for (a = 0; a < len; a++) {
+ if (test[a] != expected[a]) {
+ printf(" '%x' != '%x' at %lx of %lx\n",
+ test[a], expected[a], a, len);
+ break;
+ }
+ }
+ }
+ }
+ return OK;
+}
+
+int test_gcm128_std_vectors_nt(gcm_vector const *vector)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint64_t IV_alloc_len = 0;
+ int ret;
+
+ // Allocate space for the calculated ciphertext
+ ret = posix_memalign((void **)&ct_test, 32, vector->Plen);
+ // Allocate space for the calculated plaintext
+ ret |= posix_memalign((void **)&pt_test, 32, vector->Plen);
+ if ((ret != 0) || (ct_test == NULL) || (pt_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen;
+ // Allocate space for a copy of the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, &gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ memset(ct_test, 0, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_128_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_128_nt(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_128_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+ // test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_128_nt(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_128_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_128_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+
+ memset(pt_test, 0, vector->Plen);
+
+ if (NULL != ct_test)
+ aligned_free(ct_test);
+ if (NULL != pt_test)
+ aligned_free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
+
+int test_gcm256_std_vectors_nt(gcm_vector const *vector)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint64_t IV_alloc_len = 0;
+ int ret;
+
+ // Allocate space for the calculated ciphertext
+ ret = posix_memalign((void **)&ct_test, 32, vector->Plen);
+ // Allocate space for the calculated plaintext
+ ret |= posix_memalign((void **)&pt_test, 32, vector->Plen);
+ if ((ret != 0) || (ct_test == NULL) || (pt_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen;
+ // Allocate space for a copy of the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_256(vector->K, &gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ memset(ct_test, 0, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_256_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_256_nt(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ memset(pt_test, 0, vector->Plen);
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_256_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+ // test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_256_nt(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_256_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_256_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+
+ if (NULL != ct_test)
+ aligned_free(ct_test);
+ if (NULL != pt_test)
+ aligned_free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
+
+int test_gcm_std_vectors_nt(void)
+{
+ int const vectors_cnt = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]);
+ int vect;
+ int OK = 0;
+
+ printf("AES-GCM standard test vectors NT:\n");
+ for (vect = 0; (vect < vectors_cnt); vect++) {
+#ifdef DEBUG
+ printf("Standard vector NT %d/%d"
+ " Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ vect, vectors_cnt - 1, (int)gcm_vectors[vect].Klen,
+ (int)gcm_vectors[vect].IVlen, (int)gcm_vectors[vect].Plen,
+ (int)gcm_vectors[vect].Alen, (int)gcm_vectors[vect].Tlen);
+#else
+ printf(".");
+#endif
+ if (BITS_128 == gcm_vectors[vect].Klen)
+ OK |= test_gcm128_std_vectors_nt(&gcm_vectors[vect]);
+ else
+ OK |= test_gcm256_std_vectors_nt(&gcm_vectors[vect]);
+ if (0 != OK)
+ return OK;
+ }
+ printf("\n");
+ return OK;
+}
+
+int main(int argc, char **argv)
+{
+ int errors = 0;
+ int seed;
+
+ if (argc == 1)
+ seed = TEST_SEED;
+ else
+ seed = atoi(argv[1]);
+
+ srand(seed);
+ printf("SEED: %d\n", seed);
+
+ errors += test_gcm_std_vectors_nt();
+
+ if (0 == errors)
+ printf("...Pass\n");
+ else
+ printf("...Fail\n");
+
+ return errors;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c
new file mode 100644
index 000000000..a9e9c5914
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c
@@ -0,0 +1,272 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include <aes_gcm.h>
+#include <test.h>
+#include "ossl_helper.h"
+#include "gcm_vectors.h"
+
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
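+
+/*
+ * Illustrative note: the two configurations above process a comparable total
+ * volume of data -- roughly 8 KiB * 400000 loops ~= 3.3 GB warm versus
+ * 64 MiB * 50 loops ~= 3.4 GB cold -- so the timed work is similar and only
+ * the cache behaviour differs.
+ */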
+
+#define AAD_LENGTH 16
+#define TEST_MEM TEST_LEN
+
+static unsigned char *plaintext, *gcm_plaintext, *cyphertext, *ossl_plaintext,
+ *ossl_cyphertext, *gcm_tag, *ossl_tag, *IV, *AAD;
+static uint8_t key128[GCM_128_KEY_LEN];
+static uint8_t key256[GCM_256_KEY_LEN];
+uint8_t iv_len = 0;
+
+void mk_rand_data(uint8_t * data, uint32_t size)
+{
+ unsigned int i;
+ for (i = 0; i < size; i++) {
+ *data++ = rand();
+ }
+}
+
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, int vect, char *data_name)
+{
+ int mismatch;
+ int OK = 1;
+
+ mismatch = memcmp(test, expected, len);
+ if (mismatch) {
+ OK = 0;
+ printf(" v[%d] expected results don't match %s \t\t", vect, data_name);
+ {
+ uint64_t a;
+ for (a = 0; a < len; a++) {
+ if (test[a] != expected[a]) {
+ printf(" '%x' != '%x' at %lx of %lx\n",
+ test[a], expected[a], a, len);
+ break;
+ }
+ }
+ }
+ }
+ return OK;
+}
+
+void aes_gcm_perf(void)
+{
+ struct gcm_key_data gkey, gkey256;
+ struct gcm_context_data gctx;
+ int i;
+
+ printf
+ ("AES GCM performance parameters plain text length:%d; IV length:%d; AAD length:%d\n",
+ TEST_LEN, GCM_IV_LEN, AAD_LENGTH);
+
+ mk_rand_data(key128, sizeof(key128));
+ mk_rand_data(key256, sizeof(key256));
+
+ // This is only required once for a given key
+ aes_gcm_pre_128(key128, &gkey);
+ aes_gcm_pre_256(key256, &gkey256);
+
+ // Preload code cache
+ aes_gcm_enc_128(&gkey, &gctx, cyphertext, plaintext, TEST_LEN, IV, AAD, AAD_LENGTH,
+ gcm_tag, MAX_TAG_LEN);
+ openssl_aes_gcm_enc(key128, IV, iv_len, AAD, AAD_LENGTH, ossl_tag, MAX_TAG_LEN,
+ plaintext, TEST_LEN, ossl_cyphertext);
+ check_data(cyphertext, ossl_cyphertext, TEST_LEN, 0,
+ "ISA-L vs OpenSSL 128 key cypher text (C)");
+ check_data(gcm_tag, ossl_tag, MAX_TAG_LEN, 0, "ISA-L vs OpenSSL 128 tag (T)");
+ aes_gcm_enc_256(&gkey256, &gctx, cyphertext, plaintext, TEST_LEN, IV, AAD, AAD_LENGTH,
+ gcm_tag, MAX_TAG_LEN);
+ openssl_aes_256_gcm_enc(key256, IV, iv_len, AAD, AAD_LENGTH, ossl_tag, MAX_TAG_LEN,
+ plaintext, TEST_LEN, ossl_cyphertext);
+ check_data(cyphertext, ossl_cyphertext, TEST_LEN, 0,
+ "ISA-L vs OpenSSL 256 cypher text (C)");
+ check_data(gcm_tag, ossl_tag, MAX_TAG_LEN, 0, "ISA-L vs OpenSSL 256 tag (T)");
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_gcm_enc_128(&gkey, &gctx, cyphertext, plaintext, TEST_LEN, IV, AAD,
+ AAD_LENGTH, gcm_tag, MAX_TAG_LEN);
+ }
+
+ perf_stop(&stop);
+ printf(" aes_gcm_enc" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_gcm_enc(key128, IV, iv_len, AAD, AAD_LENGTH,
+ ossl_tag, MAX_TAG_LEN, plaintext, TEST_LEN,
+ cyphertext);
+ }
+
+ perf_stop(&stop);
+ printf("openssl_aes_gcm_enc" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_gcm_dec_128(&gkey, &gctx, plaintext, cyphertext, TEST_LEN, IV,
+ AAD, AAD_LENGTH, gcm_tag, MAX_TAG_LEN);
+ check_data(gcm_tag, gcm_tag, MAX_TAG_LEN, 0, "ISA-L check of tag (T)");
+ }
+
+ perf_stop(&stop);
+ printf(" aes_gcm_dec" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_gcm_dec(key128, IV, iv_len, AAD, AAD_LENGTH,
+ ossl_tag, MAX_TAG_LEN, cyphertext, TEST_LEN,
+ plaintext);
+ }
+
+ perf_stop(&stop);
+ printf("openssl_aes_gcm_dec" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+
+ printf("\n");
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_gcm_enc_256(&gkey256, &gctx, cyphertext, plaintext, TEST_LEN, IV,
+ AAD, AAD_LENGTH, gcm_tag, MAX_TAG_LEN);
+ }
+
+ perf_stop(&stop);
+ printf(" aes_gcm256_enc" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_256_gcm_enc(key256, IV, iv_len, AAD, AAD_LENGTH,
+ ossl_tag, MAX_TAG_LEN, plaintext, TEST_LEN,
+ cyphertext);
+ }
+
+ perf_stop(&stop);
+ printf("openssl_aes_256_gcm_enc" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_gcm_dec_256(&gkey256, &gctx, plaintext, cyphertext, TEST_LEN, IV,
+ AAD, AAD_LENGTH, gcm_tag, MAX_TAG_LEN);
+ check_data(gcm_tag, gcm_tag, MAX_TAG_LEN, 0,
+ "ISA-L check of 256 tag (T)");
+ }
+
+ perf_stop(&stop);
+ printf(" aes_gcm256_dec" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_256_gcm_dec(key256, IV, iv_len, AAD, AAD_LENGTH,
+ ossl_tag, MAX_TAG_LEN, cyphertext, TEST_LEN,
+ plaintext);
+ }
+
+ perf_stop(&stop);
+ printf("openssl_aes_256_gcm_dec" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+}
+
+int main(void)
+{
+ uint8_t const IVend[] = GCM_IV_END_MARK;
+ uint32_t OK = 1;
+
+ plaintext = malloc(TEST_LEN);
+ gcm_plaintext = malloc(TEST_LEN);
+ cyphertext = malloc(TEST_LEN);
+ ossl_plaintext = malloc(TEST_LEN + 16);
+ ossl_cyphertext = malloc(TEST_LEN);
+ gcm_tag = malloc(MAX_TAG_LEN);
+ ossl_tag = malloc(MAX_TAG_LEN);
+ AAD = malloc(AAD_LENGTH);
+ IV = malloc(GCM_IV_LEN);
+ if ((NULL == plaintext) || (NULL == cyphertext) || (NULL == gcm_plaintext)
+ || (NULL == ossl_plaintext) || (NULL == ossl_cyphertext)
+ || (NULL == gcm_tag) || (NULL == ossl_tag) || (NULL == AAD) || (NULL == IV)) {
+ printf("malloc of testsize:0x%x failed\n", TEST_LEN);
+ return -1;
+ }
+
+ mk_rand_data(plaintext, TEST_LEN);
+ mk_rand_data(AAD, AAD_LENGTH);
+ mk_rand_data(IV, GCM_IV_LEN);
+ memcpy(&IV[GCM_IV_END_START], IVend, sizeof(IVend));
+ iv_len = GCM_IV_LEN - sizeof(IVend); //end marker not part of IV length
+
+ printf("AES GCM ISA-L vs OpenSSL performance\n");
+ aes_gcm_perf();
+
+ return !OK;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c
new file mode 100644
index 000000000..ee064ef6c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c
@@ -0,0 +1,61 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <aes_gcm.h>
+#include <aes_keyexp.h>
+
+void aes_keyexp_128_enc(const void *, uint8_t *);
+void aes_gcm_precomp_128(struct gcm_key_data *key_data);
+void aes_gcm_precomp_256(struct gcm_key_data *key_data);
+
+void aes_gcm_pre_128(const void *key, struct gcm_key_data *key_data)
+{
+ aes_keyexp_128_enc(key, key_data->expanded_keys);
+ aes_gcm_precomp_128(key_data);
+}
+
+void aes_gcm_pre_256(const void *key, struct gcm_key_data *key_data)
+{
+ uint8_t tmp_exp_key[GCM_ENC_KEY_LEN * GCM_KEY_SETS];
+ aes_keyexp_256((const uint8_t *)key, (uint8_t *) key_data->expanded_keys, tmp_exp_key);
+ aes_gcm_precomp_256(key_data);
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// Version info
+struct slver aes_gcm_pre_128_slver_000002c7;
+struct slver aes_gcm_pre_128_slver = { 0x02c7, 0x00, 0x00 };
+
+struct slver aes_gcm_pre_256_slver_000002d7;
+struct slver aes_gcm_pre_256_slver = { 0x02d7, 0x00, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_simple_example.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_simple_example.c
new file mode 100644
index 000000000..4b7ca9736
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_simple_example.c
@@ -0,0 +1,78 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include "aes_gcm.h"
+
+#define TXT_SIZE 8
+#define AAD_SIZE 32
+#define TAG_SIZE 16 /* Valid values are 16, 12, or 8 */
+#define KEY_SIZE GCM_256_KEY_LEN
+#define IV_SIZE GCM_IV_DATA_LEN
+
+void mprint(const char *msg, uint8_t * buf, int len)
+{
+ int i;
+ printf("%s", msg);
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 32 == 0)
+ printf("\n");
+ }
+ printf("\n");
+}
+
+int main(void)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ uint8_t ct[TXT_SIZE], pt[TXT_SIZE], pt2[TXT_SIZE]; // Cipher text and plain text
+	uint8_t iv[IV_SIZE], aad[AAD_SIZE], key[KEY_SIZE];	// IV, additional authentication data, and key
+ uint8_t tag1[TAG_SIZE], tag2[TAG_SIZE]; // Authentication tags for encode and decode
+
+ printf("gcm example:\n");
+ memset(key, 0, KEY_SIZE);
+ memset(pt, 0, TXT_SIZE);
+ memset(iv, 0, IV_SIZE);
+ memset(aad, 0, AAD_SIZE);
+
+ aes_gcm_pre_256(key, &gkey);
+ aes_gcm_enc_256(&gkey, &gctx, ct, pt, TXT_SIZE, iv, aad, AAD_SIZE, tag1, TAG_SIZE);
+ aes_gcm_dec_256(&gkey, &gctx, pt2, ct, TXT_SIZE, iv, aad, AAD_SIZE, tag2, TAG_SIZE);
+
+ mprint(" input text: ", pt, TXT_SIZE);
+ mprint(" cipher text: ", ct, TXT_SIZE);
+ mprint(" decode text: ", pt2, TXT_SIZE);
+	mprint(" auth tag1 (enc): ", tag1, TAG_SIZE);
+	mprint(" auth tag2 (dec): ", tag2, TAG_SIZE);
+
+ return memcmp(tag1, tag2, TAG_SIZE);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_sse.asm
new file mode 100644
index 000000000..e35860496
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_sse.asm
@@ -0,0 +1,2171 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in paper:
+; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+;
+; For the shift-based reductions used in this code, we used the method described in paper:
+; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16-byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+;	AAD[2] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+;       The spec defines aadLen as a multiple of 4 bytes; this code
+;       additionally supports arbitrary aadLen values.
+;
+; TLen:
+;       From the definition of the spec, TLen can only be 8, 12, or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; Throughout the code, one-tab and two-tab indentation is used: one tab for the GHASH part, two tabs for the AES part.
+;
+
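As a reading aid for the IV diagram above: in the 12-byte IV case, the counter block Y0 is simply the IV followed by a big-endian 32-bit value of 1. A minimal, hedged C sketch of that layout (the helper name is ours, not part of ISA-L):

#include <stdint.h>
#include <string.h>

/* Hedged sketch: build the 16-byte counter block (Y0/J0) for a 12-byte IV,
 * i.e. salt + per-packet IV followed by 0x00000001, as in the diagram above. */
static void build_j0_sketch(uint8_t j0[16], const uint8_t iv[12])
{
	memcpy(j0, iv, 12);     /* Salt || Initialization Vector */
	j0[12] = 0x00;
	j0[13] = 0x00;
	j0[14] = 0x00;
	j0[15] = 0x01;          /* trailing 0x1 from the diagram */
}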
+%include "reg_sizes.asm"
+%include "gcm_defines.asm"
+
+%ifndef GCM128_MODE
+%ifndef GCM192_MODE
+%ifndef GCM256_MODE
+%error "No GCM mode selected for gcm_sse.asm!"
+%endif
+%endif
+%endif
+
+%ifndef FUNCT_EXTENSION
+%define FUNCT_EXTENSION
+%endif
+
+%ifdef GCM128_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ sse %+ FUNCT_EXTENSION
+%define NROUNDS 9
+%endif
+
+%ifdef GCM192_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ sse %+ FUNCT_EXTENSION
+%define NROUNDS 11
+%endif
+
+%ifdef GCM256_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ sse %+ FUNCT_EXTENSION
+%define NROUNDS 13
+%endif
+
+
+default rel
+; FUNC_SAVE pushes 5 registers onto the stack; STACK_OFFSET must match (8 bytes per push)
+%define STACK_OFFSET 8*5
+
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+
+%define LOCAL_STORAGE 16*7
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected)
+; Output: C = A*B*x mod poly, (i.e. >>1 )
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes
+%define %%HK %2 ; 16 Bytes
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+ ; %%GH, %%HK hold the values for the two operands which are carry-less multiplied
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; Karatsuba Method
+ movdqa %%T1, %%GH
+ pshufd %%T2, %%GH, 01001110b
+ pshufd %%T3, %%HK, 01001110b
+ pxor %%T2, %%GH ; %%T2 = (a1+a0)
+ pxor %%T3, %%HK ; %%T3 = (b1+b0)
+
+ pclmulqdq %%T1, %%HK, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0
+ pclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T2, %%GH
+ pxor %%T2, %%T1 ; %%T2 = a0*b1+a1*b0
+
+ movdqa %%T3, %%T2
+ pslldq %%T3, 8 ; shift-L %%T3 2 DWs
+ psrldq %%T2, 8 ; shift-R %%T2 2 DWs
+ pxor %%GH, %%T3
+ pxor %%T1, %%T2 ; <%%T1:%%GH> holds the result of the carry-less multiplication of %%GH by %%HK
+
+
+ ;first phase of the reduction
+ movdqa %%T2, %%GH
+ movdqa %%T3, %%GH
+ movdqa %%T4, %%GH ; move %%GH into %%T2, %%T3, %%T4 in order to perform the three shifts independently
+
+        pslld   %%T2, 31                                ; packed left shift << 31
+        pslld   %%T3, 30                                ; packed left shift << 30
+        pslld   %%T4, 25                                ; packed left shift << 25
+ pxor %%T2, %%T3 ; xor the shifted versions
+ pxor %%T2, %%T4
+
+ movdqa %%T5, %%T2
+ psrldq %%T5, 4 ; shift-R %%T5 1 DW
+
+ pslldq %%T2, 12 ; shift-L %%T2 3 DWs
+ pxor %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+        movdqa  %%T2,%%GH                               ; make 3 copies of %%GH (in %%T2, %%T3, %%T4) for doing three shift operations
+ movdqa %%T3,%%GH
+ movdqa %%T4,%%GH
+
+        psrld   %%T2,1                                  ; packed right shift >> 1
+        psrld   %%T3,2                                  ; packed right shift >> 2
+        psrld   %%T4,7                                  ; packed right shift >> 7
+ pxor %%T2,%%T3 ; xor the shifted versions
+ pxor %%T2,%%T4
+
+ pxor %%T2, %%T5
+ pxor %%GH, %%T2
+        pxor    %%GH, %%T1                              ; the result is in %%GH
+
+
+%endmacro
+
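For readers tracing the macro above, here is a hedged C-with-intrinsics mirror of GHASH_MUL (the function name is ours; it assumes SSE2 plus PCLMULQDQ support). It follows the same Karatsuba split and two-phase shift reduction, so the temporaries line up with the asm comments:

#include <emmintrin.h>  /* SSE2 */
#include <wmmintrin.h>  /* PCLMULQDQ */

/* Sketch only: gh and hk are bit-reflected 128-bit values; hk is
 * HashKey<<1 mod poly, as the macro's header comment describes. */
static __m128i ghash_mul_sketch(__m128i gh, __m128i hk)
{
	__m128i t1, t2, t3, t4, t5;

	/* Karatsuba: split each operand into 64-bit halves */
	t1 = gh;
	t2 = _mm_shuffle_epi32(gh, 0x4e);         /* swap qwords */
	t3 = _mm_shuffle_epi32(hk, 0x4e);
	t2 = _mm_xor_si128(t2, gh);               /* a1 ^ a0 */
	t3 = _mm_xor_si128(t3, hk);               /* b1 ^ b0 */

	t1 = _mm_clmulepi64_si128(t1, hk, 0x11);  /* a1*b1 */
	gh = _mm_clmulepi64_si128(gh, hk, 0x00);  /* a0*b0 */
	t2 = _mm_clmulepi64_si128(t2, t3, 0x00);  /* (a1^a0)*(b1^b0) */
	t2 = _mm_xor_si128(t2, gh);
	t2 = _mm_xor_si128(t2, t1);               /* middle part */

	t3 = _mm_slli_si128(t2, 8);
	t2 = _mm_srli_si128(t2, 8);
	gh = _mm_xor_si128(gh, t3);
	t1 = _mm_xor_si128(t1, t2);               /* <t1:gh> = 256-bit product */

	/* first phase of the reduction */
	t2 = _mm_slli_epi32(gh, 31);
	t3 = _mm_slli_epi32(gh, 30);
	t4 = _mm_slli_epi32(gh, 25);
	t2 = _mm_xor_si128(t2, t3);
	t2 = _mm_xor_si128(t2, t4);
	t5 = _mm_srli_si128(t2, 4);
	t2 = _mm_slli_si128(t2, 12);
	gh = _mm_xor_si128(gh, t2);

	/* second phase of the reduction */
	t2 = _mm_srli_epi32(gh, 1);
	t3 = _mm_srli_epi32(gh, 2);
	t4 = _mm_srli_epi32(gh, 7);
	t2 = _mm_xor_si128(t2, t3);
	t2 = _mm_xor_si128(t2, t4);
	t2 = _mm_xor_si128(t2, t5);
	gh = _mm_xor_si128(gh, t2);
	return _mm_xor_si128(gh, t1);             /* result */
}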
+
+%macro PRECOMPUTE 8
+%define %%GDATA %1
+%define %%HK %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+%define %%T6 %8
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
+ movdqa %%T4, %%HK
+ pshufd %%T1, %%HK, 01001110b
+ pxor %%T1, %%HK
+ movdqu [%%GDATA + HashKey_k], %%T1
+
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^2<<1 mod poly
+ movdqu [%%GDATA + HashKey_2], %%T4 ; [HashKey_2] = HashKey^2<<1 mod poly
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_2_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^3<<1 mod poly
+ movdqu [%%GDATA + HashKey_3], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_3_k], %%T1
+
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^4<<1 mod poly
+ movdqu [%%GDATA + HashKey_4], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_4_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^5<<1 mod poly
+ movdqu [%%GDATA + HashKey_5], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_5_k], %%T1
+
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^6<<1 mod poly
+ movdqu [%%GDATA + HashKey_6], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_6_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^7<<1 mod poly
+ movdqu [%%GDATA + HashKey_7], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_7_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^8<<1 mod poly
+ movdqu [%%GDATA + HashKey_8], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_8_k], %%T1
+
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 6
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2
+%define %%LENGTH %3
+%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
+%define %%COUNTER %5
+%define %%TMP1 %6
+
+ pxor %%OUTPUT, %%OUTPUT
+ mov %%COUNTER, %%LENGTH
+ mov %%END_READ_LOCATION, %%INPUT
+ add %%END_READ_LOCATION, %%LENGTH
+ xor %%TMP1, %%TMP1
+
+
+ cmp %%COUNTER, 8
+ jl %%_byte_loop_2
+        pinsrq  %%OUTPUT, [%%INPUT],0           ;Read in 8 bytes if they exist
+ je %%_done
+
+ sub %%COUNTER, 8
+
+%%_byte_loop_1: ;Read in data 1 byte at a time while data is left
+ shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_1
+ pinsrq %%OUTPUT, %%TMP1, 1
+ jmp %%_done
+
+%%_byte_loop_2: ;Read in data 1 byte at a time while data is left
+ cmp %%COUNTER, 0
+ je %%_done
+ shl %%TMP1, 8 ;This loop handles when no bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_2
+ pinsrq %%OUTPUT, %%TMP1, 0
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
+
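Functionally, the macro above packs up to 15 trailing bytes into a zero-padded 16-byte lane without reading past the end of the buffer. A hedged C equivalent of the result (not of the technique; the macro deliberately avoids a bounce buffer), with a name of our choosing:

#include <stdint.h>
#include <string.h>
#include <emmintrin.h>

/* Sketch only: same output as READ_SMALL_DATA_INPUT for 0 < len < 16. */
static __m128i read_small_input_sketch(const uint8_t *in, size_t len)
{
	uint8_t block[16] = { 0 };               /* zero padding */
	memcpy(block, in, len < 16 ? len : 16);  /* never reads past the end */
	return _mm_loadu_si128((const __m128i *)block);
}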
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 14
+%define %%A_IN %1
+%define %%A_LEN %2
+%define %%AAD_HASH %3
+%define %%HASH_KEY %4
+%define %%XTMP1 %5 ; xmm temp reg 5
+%define %%XTMP2 %6
+%define %%XTMP3 %7
+%define %%XTMP4 %8
+%define %%XTMP5 %9 ; xmm temp reg 5
+%define %%T1 %10 ; temp reg 1
+%define %%T2 %11
+%define %%T3 %12
+%define %%T4 %13
+%define %%T5 %14 ; temp reg 5
+
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ pxor %%AAD_HASH, %%AAD_HASH
+
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block
+
+%%_get_AAD_loop16:
+
+ movdqu %%XTMP1, [%%T1]
+ ;byte-reflect the AAD data
+ pshufb %%XTMP1, [SHUF_MASK]
+ pxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+ sub %%T2, 16
+ je %%_CALC_AAD_done
+
+ add %%T1, 16
+ cmp %%T2, 16
+ jge %%_get_AAD_loop16
+
+%%_get_small_AAD_block:
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
+ ;byte-reflect the AAD data
+ pshufb %%XTMP1, [SHUF_MASK]
+ pxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
+
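A hedged C outline of the AAD hashing flow above, assuming SSSE3 for the byte reflection and reusing the ghash_mul_sketch and read_small_input_sketch helpers sketched earlier in this file (all names are ours):

#include <stddef.h>
#include <stdint.h>
#include <tmmintrin.h>          /* _mm_shuffle_epi8 */

static __m128i calc_aad_hash_sketch(const uint8_t *aad, size_t len, __m128i hk)
{
	/* byte-reversal mask, the C counterpart of [SHUF_MASK] */
	const __m128i bswap = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
	                                   8, 9, 10, 11, 12, 13, 14, 15);
	__m128i hash = _mm_setzero_si128();

	while (len >= 16) {                      /* %%_get_AAD_loop16 */
		__m128i blk = _mm_loadu_si128((const __m128i *)aad);
		blk = _mm_shuffle_epi8(blk, bswap);
		hash = ghash_mul_sketch(_mm_xor_si128(hash, blk), hk);
		aad += 16;
		len -= 16;
	}
	if (len) {                               /* %%_get_small_AAD_block */
		__m128i blk = read_small_input_sketch(aad, len);
		blk = _mm_shuffle_epi8(blk, bswap);
		hash = ghash_mul_sketch(_mm_xor_si128(hash, blk), hk);
	}
	return hash;
}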
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input: gcm_key_data (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN),
+; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET),
+; and whether encoding or decoding (ENC_DEC).
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 8
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%DATA_OFFSET %6
+%define %%AAD_HASH %7
+%define %%ENC_DEC %8
+ mov r13, [%%GDATA_CTX + PBlockLen]
+ cmp r13, 0
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+ XLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register
+ jmp %%_data_read
+
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
+ mov r13, [%%GDATA_CTX + PBlockLen]
+
+%%_data_read: ;Finished reading in data
+
+
+ movdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = ctx_data.partial_block_enc_key
+ movdqu xmm13, [%%GDATA_KEY + HashKey]
+
+ lea r12, [SHIFT_MASK]
+
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ movdqu xmm2, [r12] ; get the appropriate shuffle mask
+ pshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ movdqa xmm3, xmm1
+ pxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+        jge     %%_no_extra_mask_1              ;Determine if the partial block is not being completely filled and adjust the shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_1:
+
+ movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ pand xmm3, xmm1
+ pshufb xmm3, [SHUF_MASK]
+ pshufb xmm3, xmm2
+ pxor %%AAD_HASH, xmm3
+
+
+ cmp r15,0
+ jl %%_partial_incomplete_1
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA_CTX + PBlockLen], rax
+ jmp %%_dec_done
+%%_partial_incomplete_1:
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%%_dec_done:
+ movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+%else
+ pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+        jge     %%_no_extra_mask_2              ;Determine if the partial block is not being completely filled and adjust the shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_2:
+
+ movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ pshufb xmm9, [SHUF_MASK]
+ pshufb xmm9, xmm2
+ pxor %%AAD_HASH, xmm9
+
+ cmp r15,0
+ jl %%_partial_incomplete_2
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA_CTX + PBlockLen], rax
+ jmp %%_encode_done
+%%_partial_incomplete_2:
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%%_encode_done:
+ movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+ pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ pshufb xmm9, xmm2
+%endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ cmp r15,0
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill:
+ mov r13, %%PLAIN_CYPH_LEN
+%%_count_set:
+ movq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ psrldq xmm9, 8
+ movq rax, xmm9
+ sub r13, 8
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
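The keystream bookkeeping that PARTIAL_BLOCK performs between update calls can be hard to follow in masked-shuffle form. A hedged, purely illustrative C sketch of the idea (the GHASH update and output masking are omitted; all names are ours):

#include <stddef.h>
#include <stdint.h>

/* Up to 15 bytes of the previous call's keystream block (PBlockEncKey) are
 * left over; new input is XORed against them before the bulk loop resumes
 * on a 16-byte boundary. Returns the number of input bytes consumed here. */
static size_t partial_block_sketch(uint8_t *out, const uint8_t *in, size_t len,
                                   const uint8_t keystream[16], size_t *pblock_len)
{
	size_t used = *pblock_len;               /* keystream bytes already consumed */
	size_t n = 0;

	while (used < 16 && n < len) {           /* finish the carried block */
		out[n] = in[n] ^ keystream[used];
		used++;
		n++;
	}
	*pblock_len = (used == 16) ? 0 : used;   /* block complete -> reset */
	return n;
}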
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified
+; Updated AAD_HASH is returned in %%T3
+
+%macro INITIAL_BLOCKS 24
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%LENGTH %5
+%define %%DATA_OFFSET %6
+%define %%num_initial_blocks %7 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %8
+%define %%HASH_KEY %9
+%define %%T3 %10
+%define %%T4 %11
+%define %%T5 %12
+%define %%CTR %13
+%define %%XMM1 %14
+%define %%XMM2 %15
+%define %%XMM3 %16
+%define %%XMM4 %17
+%define %%XMM5 %18
+%define %%XMM6 %19
+%define %%XMM7 %20
+%define %%XMM8 %21
+%define %%T6 %22
+%define %%T_key %23
+%define %%ENC_DEC %24
+
+%assign i (8-%%num_initial_blocks)
+ movdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg
+
+ ; start AES for %%num_initial_blocks blocks
+ movdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0
+
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa reg(i), %%CTR
+ pshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+movdqu %%T_key, [%%GDATA_KEY+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ pxor reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep NROUNDS                                                    ; encrypt N blocks with NROUNDS aesenc rounds (9/11/13 for GCM128/192/256)
+movdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ aesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+movdqu  %%T_key, [%%GDATA_KEY+16*j]                             ; encrypt with the last key round (10th/12th/14th for GCM128/192/256)
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ aesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ pxor reg(i), %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ movdqa reg(i), %%T1
+ %endif
+ pshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations
+%assign i (i+1)
+%endrep
+
+
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+
+%rep %%num_initial_blocks
+ pxor reg(j), reg(i)
+ GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+ ; %%XMM8 has the current Hash Value
+ movdqa %%T3, %%XMM8
+
+ cmp %%LENGTH, 128
+ jl %%_initial_blocks_done ; no need for precomputed constants
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM1, %%CTR
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM2, %%CTR
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM3, %%CTR
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM4, %%CTR
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM5, %%CTR
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM6, %%CTR
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM7, %%CTR
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM8, %%CTR
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ movdqu %%T_key, [%%GDATA_KEY+16*0]
+ pxor %%XMM1, %%T_key
+ pxor %%XMM2, %%T_key
+ pxor %%XMM3, %%T_key
+ pxor %%XMM4, %%T_key
+ pxor %%XMM5, %%T_key
+ pxor %%XMM6, %%T_key
+ pxor %%XMM7, %%T_key
+ pxor %%XMM8, %%T_key
+
+
+%assign i 1
+%rep NROUNDS                                                    ; do the first NROUNDS aesenc rounds (9/11/13 for GCM128/192/256)
+ movdqu %%T_key, [%%GDATA_KEY+16*i]
+ aesenc %%XMM1, %%T_key
+ aesenc %%XMM2, %%T_key
+ aesenc %%XMM3, %%T_key
+ aesenc %%XMM4, %%T_key
+ aesenc %%XMM5, %%T_key
+ aesenc %%XMM6, %%T_key
+ aesenc %%XMM7, %%T_key
+ aesenc %%XMM8, %%T_key
+%assign i (i+1)
+%endrep
+
+
+ movdqu %%T_key, [%%GDATA_KEY+16*i] ; do final key round
+ aesenclast %%XMM1, %%T_key
+ aesenclast %%XMM2, %%T_key
+ aesenclast %%XMM3, %%T_key
+ aesenclast %%XMM4, %%T_key
+ aesenclast %%XMM5, %%T_key
+ aesenclast %%XMM6, %%T_key
+ aesenclast %%XMM7, %%T_key
+ aesenclast %%XMM8, %%T_key
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ pxor %%XMM1, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM1, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ pxor %%XMM2, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM2, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ pxor %%XMM3, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM3, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ pxor %%XMM4, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM4, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ pxor %%XMM5, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM5, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ pxor %%XMM6, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM6, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ pxor %%XMM7, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM7, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ pxor %%XMM8, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM8, %%T1
+ %endif
+
+ add %%DATA_OFFSET, 128
+
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ pxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+
+
+; encrypt 8 blocks at a time
+; ghash the 8 previously encrypted ciphertext blocks
+; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; %%DATA_OFFSET is the data offset value
+%macro GHASH_8_ENCRYPT_8_PARALLEL 22
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5
+%define %%T2 %6
+%define %%T3 %7
+%define %%T4 %8
+%define %%T5 %9
+%define %%T6 %10
+%define %%CTR %11
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20
+%define %%loop_idx %21
+%define %%ENC_DEC %22
+
+ movdqa %%T7, %%XMM1
+ movdqu [rsp + TMP2], %%XMM2
+ movdqu [rsp + TMP3], %%XMM3
+ movdqu [rsp + TMP4], %%XMM4
+ movdqu [rsp + TMP5], %%XMM5
+ movdqu [rsp + TMP6], %%XMM6
+ movdqu [rsp + TMP7], %%XMM7
+ movdqu [rsp + TMP8], %%XMM8
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba Method
+
+ movdqa %%T4, %%T7
+ pshufd %%T6, %%T7, 01001110b
+ pxor %%T6, %%T7
+ %ifidn %%loop_idx, in_order
+ paddd %%CTR, [ONE] ; INCR CNT
+ %else
+ paddd %%CTR, [ONEf] ; INCR CNT
+ %endif
+ movdqu %%T5, [%%GDATA + HashKey_8]
+ pclmulqdq %%T4, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T7, %%T5, 0x00 ; %%T7 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_8_k]
+ pclmulqdq %%T6, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ movdqa %%XMM1, %%CTR
+
+ %ifidn %%loop_idx, in_order
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM2, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM3, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM4, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM5, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM6, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM7, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM8, %%CTR
+
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+ %else
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM2, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM3, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM4, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM5, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM6, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM7, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM8, %%CTR
+ %endif
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ movdqu %%T1, [%%GDATA + 16*0]
+ pxor %%XMM1, %%T1
+ pxor %%XMM2, %%T1
+ pxor %%XMM3, %%T1
+ pxor %%XMM4, %%T1
+ pxor %%XMM5, %%T1
+ pxor %%XMM6, %%T1
+ pxor %%XMM7, %%T1
+ pxor %%XMM8, %%T1
+
+ ;; %%XMM6, %%T5 hold the values for the two operands which are carry-less multiplied
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba Method
+ movdqu %%T1, [rsp + TMP2]
+ movdqa %%T3, %%T1
+
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_7]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_7_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*1]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+
+ movdqu %%T1, [%%GDATA + 16*2]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; Karatsuba Method
+ movdqu %%T1, [rsp + TMP3]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_6]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_6_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*3]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [rsp + TMP4]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_5]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_5_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*4]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*5]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [rsp + TMP5]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_4]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_4_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+
+ movdqu %%T1, [%%GDATA + 16*6]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+ movdqu %%T1, [rsp + TMP6]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_3]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_3_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*7]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [rsp + TMP7]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_2]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_2_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*8]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+
+ ;; %%XMM8, %%T5 hold the values for the two operands which are carry-less multiplied
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba Method
+ movdqu %%T1, [rsp + TMP8]
+ movdqa %%T3, %%T1
+
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T7, %%T3
+ pxor %%T4, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*9]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+
+%ifdef GCM128_MODE
+ movdqu %%T5, [%%GDATA + 16*10]
+%endif
+%ifdef GCM192_MODE
+ movdqu %%T1, [%%GDATA + 16*10]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*11]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T5, [%%GDATA + 16*12] ; finish last key round
+%endif
+%ifdef GCM256_MODE
+ movdqu %%T1, [%%GDATA + 16*10]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*11]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*12]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*13]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T5, [%%GDATA + 16*14] ; finish last key round
+%endif
+
+%assign i 0
+%assign j 1
+%rep 8
+ XLDR %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+
+%ifidn %%ENC_DEC, DEC
+ movdqa %%T3, %%T1
+%endif
+
+ pxor %%T1, %%T5
+ aesenclast reg(j), %%T1 ; XMM1:XMM8
+ XSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], reg(j) ; Write to the Output buffer
+
+%ifidn %%ENC_DEC, DEC
+ movdqa reg(j), %%T3
+%endif
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+
+
+
+
+ pxor %%T2, %%T6
+ pxor %%T2, %%T4
+ pxor %%T2, %%T7
+
+
+ movdqa %%T3, %%T2
+ pslldq %%T3, 8 ; shift-L %%T3 2 DWs
+ psrldq %%T2, 8 ; shift-R %%T2 2 DWs
+ pxor %%T7, %%T3
+ pxor %%T4, %%T2 ; accumulate the results in %%T4:%%T7
+
+
+
+ ;first phase of the reduction
+ movdqa %%T2, %%T7
+ movdqa %%T3, %%T7
+ movdqa %%T1, %%T7 ; move %%T7 into %%T2, %%T3, %%T1 in order to perform the three shifts independently
+
+        pslld   %%T2, 31                                        ; packed left shift << 31
+        pslld   %%T3, 30                                        ; packed left shift << 30
+        pslld   %%T1, 25                                        ; packed left shift << 25
+ pxor %%T2, %%T3 ; xor the shifted versions
+ pxor %%T2, %%T1
+
+ movdqa %%T5, %%T2
+ psrldq %%T5, 4 ; shift-R %%T5 1 DW
+
+ pslldq %%T2, 12 ; shift-L %%T2 3 DWs
+ pxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ ;second phase of the reduction
+        movdqa  %%T2,%%T7                                       ; make 3 copies of %%T7 (in %%T2, %%T3, %%T1) for doing three shift operations
+ movdqa %%T3,%%T7
+ movdqa %%T1,%%T7
+
+        psrld   %%T2,1                                          ; packed right shift >> 1
+        psrld   %%T3,2                                          ; packed right shift >> 2
+        psrld   %%T1,7                                          ; packed right shift >> 7
+ pxor %%T2,%%T3 ; xor the shifted versions
+ pxor %%T2,%%T1
+
+ pxor %%T2, %%T5
+ pxor %%T7, %%T2
+        pxor    %%T7, %%T4                                      ; the result is in %%T7
+
+
+ pxor %%XMM1, %%T7
+
+%endmacro
+
+
+; GHASH the last 8 ciphertext blocks.
+%macro GHASH_LAST_8 16
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+
+ ; Karatsuba Method
+ movdqa %%T6, %%XMM1
+ pshufd %%T2, %%XMM1, 01001110b
+ pxor %%T2, %%XMM1
+ movdqu %%T5, [%%GDATA + HashKey_8]
+ pclmulqdq %%T6, %%T5, 0x11 ; %%T6 = a1*b1
+
+ pclmulqdq %%XMM1, %%T5, 0x00 ; %%XMM1 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_8_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ movdqa %%T7, %%XMM1
+ movdqa %%XMM1, %%T2 ; result in %%T6, %%T7, %%XMM1
+
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM2
+ pshufd %%T2, %%XMM2, 01001110b
+ pxor %%T2, %%XMM2
+ movdqu %%T5, [%%GDATA + HashKey_7]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM2, %%T5, 0x00 ; %%XMM2 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_7_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM2
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM3
+ pshufd %%T2, %%XMM3, 01001110b
+ pxor %%T2, %%XMM3
+ movdqu %%T5, [%%GDATA + HashKey_6]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM3, %%T5, 0x00 ; %%XMM3 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_6_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM3
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM4
+ pshufd %%T2, %%XMM4, 01001110b
+ pxor %%T2, %%XMM4
+ movdqu %%T5, [%%GDATA + HashKey_5]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM4, %%T5, 0x00              ; %%XMM4 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_5_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM4
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM5
+ pshufd %%T2, %%XMM5, 01001110b
+ pxor %%T2, %%XMM5
+ movdqu %%T5, [%%GDATA + HashKey_4]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM5, %%T5, 0x00              ; %%XMM5 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_4_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM5
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM6
+ pshufd %%T2, %%XMM6, 01001110b
+ pxor %%T2, %%XMM6
+ movdqu %%T5, [%%GDATA + HashKey_3]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM6, %%T5, 0x00              ; %%XMM6 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_3_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM6
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM7
+ pshufd %%T2, %%XMM7, 01001110b
+ pxor %%T2, %%XMM7
+ movdqu %%T5, [%%GDATA + HashKey_2]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM7, %%T5, 0x00              ; %%XMM7 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_2_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM7
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM8
+ pshufd %%T2, %%XMM8, 01001110b
+ pxor %%T2, %%XMM8
+ movdqu %%T5, [%%GDATA + HashKey]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM8, %%T5, 0x00              ; %%XMM8 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM8
+ pxor %%T2, %%XMM1
+ pxor %%T2, %%T6
+ pxor %%T2, %%T7 ; middle section of the temp results combined as in Karatsuba algorithm
+
+
+ movdqa %%T4, %%T2
+ pslldq %%T4, 8 ; shift-L %%T4 2 DWs
+ psrldq %%T2, 8 ; shift-R %%T2 2 DWs
+ pxor %%T7, %%T4
+ pxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+
+ ;first phase of the reduction
+ movdqa %%T2, %%T7
+ movdqa %%T3, %%T7
+ movdqa %%T4, %%T7 ; move %%T7 into %%T2, %%T3, %%T4 in order to perform the three shifts independently
+
+        pslld   %%T2, 31                                        ; packed left shift << 31
+        pslld   %%T3, 30                                        ; packed left shift << 30
+        pslld   %%T4, 25                                        ; packed left shift << 25
+ pxor %%T2, %%T3 ; xor the shifted versions
+ pxor %%T2, %%T4
+
+ movdqa %%T1, %%T2
+ psrldq %%T1, 4 ; shift-R %%T1 1 DW
+
+ pslldq %%T2, 12 ; shift-L %%T2 3 DWs
+ pxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+        movdqa  %%T2,%%T7                                       ; make 3 copies of %%T7 (in %%T2, %%T3, %%T4) for doing three shift operations
+ movdqa %%T3,%%T7
+ movdqa %%T4,%%T7
+
+        psrld   %%T2,1                                          ; packed right shift >> 1
+        psrld   %%T3,2                                          ; packed right shift >> 2
+        psrld   %%T4,7                                          ; packed right shift >> 7
+ pxor %%T2,%%T3 ; xor the shifted versions
+ pxor %%T2,%%T4
+
+ pxor %%T2, %%T1
+ pxor %%T7, %%T2
+ pxor %%T6, %%T7 ; the result is in %%T6
+
+%endmacro
+
+; Encryption of a single block
+%macro ENCRYPT_SINGLE_BLOCK 3
+%define %%GDATA %1
+%define %%ST %2
+%define %%T1 %3
+ movdqu %%T1, [%%GDATA+16*0]
+ pxor %%ST, %%T1
+%assign i 1
+%rep NROUNDS
+ movdqu %%T1, [%%GDATA+16*i]
+ aesenc %%ST, %%T1
+%assign i (i+1)
+%endrep
+ movdqu %%T1, [%%GDATA+16*i]
+ aesenclast %%ST, %%T1
+%endmacro
+
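A hedged AES-NI mirror of ENCRYPT_SINGLE_BLOCK (the helper name and signature are ours; it assumes the AES-NI intrinsics and an expanded key schedule laid out as consecutive 16-byte round keys, as the asm expects):

#include <emmintrin.h>
#include <wmmintrin.h>          /* AES-NI intrinsics */

/* nrounds+1 round keys: nrounds is 10/12/14 for AES-128/192/256
 * (the asm's NROUNDS of 9/11/13 excludes the aesenclast round). */
static __m128i encrypt_single_block_sketch(const __m128i *round_keys,
                                           int nrounds, __m128i st)
{
	int i;

	st = _mm_xor_si128(st, round_keys[0]);          /* initial whitening */
	for (i = 1; i < nrounds; i++)
		st = _mm_aesenc_si128(st, round_keys[i]);
	return _mm_aesenclast_si128(st, round_keys[nrounds]);
}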
+
+;; Start of Stack Setup
+
+%macro FUNC_SAVE 0
+        ;; Required for Update/GCM_ENC
+ ;the number of pushes must equal STACK_OFFSET
+ push r12
+ push r13
+ push r14
+ push r15
+ push rsi
+ mov r14, rsp
+
+ sub rsp, VARIABLE_OFFSET
+        and     rsp, ~63                        ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+ movdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
+ movdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
+ movdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
+ movdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
+ movdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
+ movdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
+ movdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
+ movdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
+ movdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
+
+ mov arg5, arg(5) ;[r14 + STACK_OFFSET + 8*5]
+%endif
+%endmacro
+
+
+%macro FUNC_RESTORE 0
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16]
+ movdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16]
+ movdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16]
+ movdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16]
+ movdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16]
+ movdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16]
+ movdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16]
+ movdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16]
+ movdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16]
+ movdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16]
+%endif
+
+;; Required for Update/GCM_ENC
+ mov rsp, r14
+ pop rsi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
+; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV,
+; Additional Authentication data (A_IN), Additional Data length (A_LEN).
+; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA.
+; Clobbers rax, r10-r13 and xmm0-xmm6
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 5
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%IV %3
+%define %%A_IN %4
+%define %%A_LEN %5
+%define %%AAD_HASH xmm0
+%define %%SUBHASH xmm1
+
+
+ movdqu %%SUBHASH, [%%GDATA_KEY + HashKey]
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
+ pxor xmm2, xmm3
+ mov r10, %%A_LEN
+
+ movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash
+ mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length
+ xor r10, r10
+ mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0
+ mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0
+ movdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = 0
+ mov r10, %%IV
+ movdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001
+ pinsrq xmm2, [r10], 0
+ pinsrd xmm2, [r10+8], 2
+ movdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv
+
+ pshufb xmm2, [SHUF_MASK]
+
+ movdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data
+; struct has been initialized by GCM_INIT.
+; Requires the input data to be at least 1 byte long because of READ_SMALL_DATA_INPUT.
+; Input: gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN),
+; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 6
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%ENC_DEC %6
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16byte blocks in the message
+; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+;       if there is a block of less than 16 bytes, process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
+;       (a short C sketch of this block split follows the macro)
+
+ cmp %%PLAIN_CYPH_LEN, 0
+ je %%_multiple_of_16_bytes
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+ add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN ;Update length of data processed
+ movdqu xmm13, [%%GDATA_KEY + HashKey] ; xmm13 = HashKey
+ movdqu xmm8, [%%GDATA_CTX + AadHash]
+
+
+ PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+
+ mov r13, %%PLAIN_CYPH_LEN ; save the number of bytes of plaintext/ciphertext
+ sub r13, %%DATA_OFFSET
+ mov r10, r13 ;save the amount of data left to process in r10
+ and r13, -16 ; r13 = r13 - (r13 mod 16)
+
+ mov r12, r13
+ shr r12, 4
+ and r12, 7
+ jz %%_initial_num_blocks_is_0
+
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*7
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*6
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*5
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*4
+ jmp %%_initial_blocks_encrypted
+
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*3
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*2
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ cmp r13, 0
+ je %%_zero_cipher_left
+
+ sub r13, 128
+ je %%_eight_cipher_left
+
+
+
+
+ movd r15d, xmm9
+ and r15d, 255
+ pshufb xmm9, [SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ cmp r15d, 255-8
+ jg %%_encrypt_by_8
+
+
+
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ pshufb xmm9, [SHUF_MASK]
+ jmp %%_eight_cipher_left
+
+%%_encrypt_by_8:
+ pshufb xmm9, [SHUF_MASK]
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
+ pshufb xmm9, [SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ pshufb xmm9, [SHUF_MASK]
+
+
+
+
+%%_eight_cipher_left:
+ GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+
+%%_zero_cipher_left:
+ movdqu [%%GDATA_CTX + AadHash], xmm14
+ movdqu [%%GDATA_CTX + CurCount], xmm9
+
+ mov r13, r10
+ and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16)
+
+ je %%_multiple_of_16_bytes
+
+        mov     [%%GDATA_CTX + PBlockLen], r13          ; my_ctx.data.partial_block_length = r13
+        ; handle the last <16 Byte block separately
+
+ paddd xmm9, [ONE] ; INCR CNT to get Yn
+ movdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx.data.current_counter = xmm9
+ pshufb xmm9, [SHUF_MASK]
+ ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9, xmm2 ; E(K, Yn)
+ movdqu [%%GDATA_CTX + PBlockEncKey], xmm9 ; my_ctx_data.partial_block_enc_key = xmm9
+
+ cmp %%PLAIN_CYPH_LEN, 16
+ jge %%_large_enough_update
+
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13
+ jmp %%_data_read
+
+%%_large_enough_update:
+ sub %%DATA_OFFSET, 16
+ add %%DATA_OFFSET, r13
+
+ movdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block
+
+ sub %%DATA_OFFSET, r13
+ add %%DATA_OFFSET, 16
+
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
+ movdqu xmm2, [r12] ; get the appropriate shuffle mask
+ pshufb xmm1, xmm2 ; shift right 16-r13 bytes
+%%_data_read:
+ %ifidn %%ENC_DEC, DEC
+ movdqa xmm2, xmm1
+ pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ pand xmm2, xmm1
+ pshufb xmm2, [SHUF_MASK]
+ pxor xmm14, xmm2
+ movdqu [%%GDATA_CTX + AadHash], xmm14
+
+ %else
+ pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ pshufb xmm9, [SHUF_MASK]
+ pxor xmm14, xmm9
+ movdqu [%%GDATA_CTX + AadHash], xmm14
+
+ pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ %endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output r13 Bytes
+ movq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ psrldq xmm9, 8
+ movq rax, xmm9
+ sub r13, 8
+
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_multiple_of_16_bytes:
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
+; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data * (GDATA_CTX) and
+; whether encoding or decoding (ENC_DEC).
+; Output: Authentication Tag (AUTH_TAG) and Authentication Tag length (AUTH_TAG_LEN)
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 5
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%AUTH_TAG %3
+%define %%AUTH_TAG_LEN %4
+%define %%ENC_DEC %5
+%define %%PLAIN_CYPH_LEN rax
+
+	mov	r12, [%%GDATA_CTX + PBlockLen]		; r12 = PBlockLen (number of bytes in the partial block)
+ movdqu xmm14, [%%GDATA_CTX + AadHash]
+ movdqu xmm13, [%%GDATA_KEY + HashKey]
+
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ movdqu [%%GDATA_CTX + AadHash], xmm14
+
+%%_partial_done:
+
+ mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
+
+ shl r12, 3 ; convert into number of bits
+ movd xmm15, r12d ; len(A) in xmm15
+
+	shl	%%PLAIN_CYPH_LEN, 3			; len(C) in bits (*8)
+ movq xmm1, %%PLAIN_CYPH_LEN
+ pslldq xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ pxor xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
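+        ; Fold len(A)||len(C) into the hash; the tag is then T = GHASH result XOR E(K, Y0),
+        ; as defined by the GCM specification.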
+ pxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
+ pshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap
+
+ movdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0
+
+ ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9, xmm2 ; E(K, Y0)
+
+ pxor xmm9, xmm14
+
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+%%_T_8:
+ movq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ movq rax, xmm9
+ mov [r10], rax
+ psrldq xmm9, 8
+ movd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+
+%%_T_16:
+ movdqu [r10], xmm9
+
+%%_return_T_done:
+%endmacro ;GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_precomp_128_sse / aes_gcm_precomp_192_sse / aes_gcm_precomp_256_sse
+; (struct gcm_key_data *key_data);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
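+; Derives the GHASH key H = E(K, 0^128), reduces it to HashKey<<1 mod poly and
+; precomputes the table of hash-key powers used by the GHASH code.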
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(precomp,_)
+FN_NAME(precomp,_):
+ endbranch
+
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp
+
+
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; only xmm6 needs to be maintained
+ movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+ pxor xmm6, xmm6
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6, xmm2 ; xmm6 = HashKey
+
+ pshufb xmm6, [SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+ movdqa xmm2, xmm6
+ psllq xmm6, 1
+ psrlq xmm2, 63
+ movdqa xmm1, xmm2
+ pslldq xmm2, 8
+ psrldq xmm1, 8
+ por xmm6, xmm2
+ ;reduction
+ pshufd xmm2, xmm1, 00100100b
+ pcmpeqd xmm2, [TWOONE]
+ pand xmm2, [POLY]
+ pxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+ mov rsp, r14
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+	ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_init_128_sse / aes_gcm_init_192_sse / aes_gcm_init_256_sse (
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(init,_)
+FN_NAME(init,_):
+ endbranch
+
+ push r12
+ push r13
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ push arg5
+ sub rsp, 1*16
+ movdqu [rsp + 0*16],xmm6
+ mov arg5, [rsp + 1*16 + 8*3 + 8*5]
+%endif
+
+ GCM_INIT arg1, arg2, arg3, arg4, arg5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm6 , [rsp + 0*16]
+ add rsp, 1*16
+ pop arg5
+%endif
+ pop r13
+ pop r12
+ ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_update_sse / aes_gcm_enc_192_update_sse / aes_gcm_enc_256_update_sse (
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(enc,_update_)
+FN_NAME(enc,_update_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_update_sse / aes_gcm_dec_192_update_sse / aes_gcm_dec_256_update_sse (
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(dec,_update_)
+FN_NAME(dec,_update_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_finalize_sse / aes_gcm_enc_192_finalize_sse / aes_gcm_enc_256_finalize_sse (
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(enc,_finalize_)
+FN_NAME(enc,_finalize_):
+ endbranch
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ movdqu [rsp + 0*16],xmm6
+ movdqu [rsp + 1*16],xmm9
+ movdqu [rsp + 2*16],xmm11
+ movdqu [rsp + 3*16],xmm14
+ movdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, arg4, ENC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm15 , [rsp + 4*16]
+ movdqu xmm14 , [rsp+ 3*16]
+ movdqu xmm11 , [rsp + 2*16]
+ movdqu xmm9 , [rsp + 1*16]
+ movdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_finalize_sse / aes_gcm_dec_192_finalize_sse / aes_gcm_dec_256_finalize_sse (
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(dec,_finalize_)
+FN_NAME(dec,_finalize_):
+ endbranch
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ movdqu [rsp + 0*16],xmm6
+ movdqu [rsp + 1*16],xmm9
+ movdqu [rsp + 2*16],xmm11
+ movdqu [rsp + 3*16],xmm14
+ movdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, arg4, DEC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm15 , [rsp + 4*16]
+ movdqu xmm14 , [rsp+ 3*16]
+ movdqu xmm11 , [rsp + 2*16]
+ movdqu xmm9 , [rsp + 1*16]
+ movdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_sse / aes_gcm_enc_192_sse / aes_gcm_enc_256_sse (
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(enc,_)
+FN_NAME(enc,_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC
+
+ GCM_COMPLETE arg1, arg2, arg9, arg10, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_sse / aes_gcm_dec_192_sse / aes_gcm_dec_256_sse (
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(dec,_)
+FN_NAME(dec,_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC
+
+ GCM_COMPLETE arg1, arg2, arg9, arg10, DEC
+
+ FUNC_RESTORE
+
+ ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_random_test.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_random_test.c
new file mode 100644
index 000000000..b0a6221d5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_random_test.c
@@ -0,0 +1,1940 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h> // for memcmp
+#include <aes_gcm.h>
+#include <openssl/sha.h>
+#include "gcm_vectors.h"
+#include "ossl_helper.h"
+#include "types.h"
+
+//#define GCM_VECTORS_VERBOSE
+//#define GCM_VECTORS_EXTRA_VERBOSE
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#ifndef RANDOMS
+# define RANDOMS 200
+#endif
+#ifndef TEST_LEN
+# define TEST_LEN 32*1024
+#endif
+#ifndef PAGE_LEN
+# define PAGE_LEN (4*1024)
+#endif
+
+#if defined(NT_LD) || defined(NT_ST) || defined(NT_LDST)
+# define ALIGNMENT_MASK (~15)
+# define OFFSET_BASE_VALUE 16
+#ifndef MAX_UNALIGNED
+# define MAX_UNALIGNED (1)
+#endif
+#else
+# define ALIGNMENT_MASK (~0)
+# define OFFSET_BASE_VALUE 1
+#ifndef MAX_UNALIGNED
+# define MAX_UNALIGNED (16)
+#endif
+#endif
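+
+// The non-temporal (NT_*) load/store variants require 16-byte-aligned buffers, so the
+// random tests keep offsets and chunk sizes 16-byte aligned via ALIGNMENT_MASK,
+// OFFSET_BASE_VALUE and MAX_UNALIGNED above.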
+
+void dump_table(char *title, uint8_t * table, uint8_t count)
+{
+ int i;
+ char const *space = " ";
+
+ printf("%s%s => {\n", space, title);
+ for (i = 0; i < count; i++) {
+ if (0 == (i & 15))
+ printf("%s%s", space, space);
+ printf("%2x, ", table[i]);
+ if (15 == (i & 15))
+ printf("\n");
+
+ }
+ printf("%s}\n", space);
+}
+
+void dump_gcm_data(struct gcm_key_data *gkey)
+{
+#ifdef GCM_VECTORS_EXTRA_VERBOSE
+ printf("gcm_data {\n");
+ dump_table("expanded_keys", gkey->expanded_keys, (16 * 11));
+ dump_table("shifted_hkey_1", gkey->shifted_hkey_1, 16);
+ dump_table("shifted_hkey_2", gkey->shifted_hkey_2, 16);
+ dump_table("shifted_hkey_3", gkey->shifted_hkey_3, 16);
+ dump_table("shifted_hkey_4", gkey->shifted_hkey_4, 16);
+ dump_table("shifted_hkey_5", gkey->shifted_hkey_5, 16);
+ dump_table("shifted_hkey_6", gkey->shifted_hkey_6, 16);
+ dump_table("shifted_hkey_7", gkey->shifted_hkey_7, 16);
+ dump_table("shifted_hkey_8", gkey->shifted_hkey_8, 16);
+ dump_table("shifted_hkey_1_k", gkey->shifted_hkey_1_k, 16);
+ dump_table("shifted_hkey_2_k", gkey->shifted_hkey_2_k, 16);
+ dump_table("shifted_hkey_3_k", gkey->shifted_hkey_3_k, 16);
+ dump_table("shifted_hkey_4_k", gkey->shifted_hkey_4_k, 16);
+ dump_table("shifted_hkey_5_k", gkey->shifted_hkey_5_k, 16);
+ dump_table("shifted_hkey_6_k", gkey->shifted_hkey_6_k, 16);
+ dump_table("shifted_hkey_7_k", gkey->shifted_hkey_7_k, 16);
+ dump_table("shifted_hkey_8_k", gkey->shifted_hkey_8_k, 16);
+ printf("}\n");
+#endif //GCM_VECTORS_EXTRA_VERBOSE
+}
+
+void mk_rand_data(uint8_t * data, uint32_t size)
+{
+ int i;
+ for (i = 0; i < size; i++) {
+ *data++ = rand();
+ }
+}
+
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name)
+{
+ int mismatch;
+ int OK = 0;
+
+ mismatch = memcmp(test, expected, len);
+ if (mismatch) {
+ OK = 1;
+ printf(" expected results don't match %s \t\t", data_name);
+ {
+ uint64_t a;
+ for (a = 0; a < len; a++) {
+ if (test[a] != expected[a]) {
+ printf(" '%x' != '%x' at %lx of %lx\n",
+ test[a], expected[a], a, len);
+ break;
+ }
+ }
+ }
+ }
+ return OK;
+}
+
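+// Runs a single random vector through the one-shot ISA-L AES-GCM-128 encrypt/decrypt
+// APIs and cross-checks ciphertext, plaintext and tag against OpenSSL.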
+int check_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx, gcm_vector * vector)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+	// Allocate space for the calculated plaintext and ciphertext
+ if (vector->Plen != 0) {
+ pt_test = malloc(vector->Plen);
+ ct_test = malloc(vector->Plen);
+ o_ct_test = malloc(vector->Plen);
+ if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+	// Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_enc_128(gkey, gctx, vector->C, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aes_gcm_dec_128(gkey, gctx, vector->P, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_128(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ free(pt_test);
+ free(ct_test);
+ free(o_ct_test);
+
+ return OK;
+}
+
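+// Streaming variant of check_vector: feeds the message to the *_update APIs in
+// randomly sized chunks (copied into freshly malloc'd buffers) and interleaves
+// unrelated SHA1 work between chunks, then cross-checks against OpenSSL.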
+int check_strm_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx,
+ gcm_vector * vector, int test_len)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint8_t *stream = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ uint32_t last_break;
+ int i;
+ uint8_t *rand_data = NULL;
+ uint64_t length;
+
+ rand_data = malloc(100);
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+	// Allocate space for the calculated plaintext and ciphertext
+ if (vector->Plen != 0) {
+ pt_test = malloc(vector->Plen);
+ ct_test = malloc(vector->Plen);
+ o_ct_test = malloc(vector->Plen);
+ if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+	// Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+
+ last_break = 0;
+ i = (rand() % test_len / 32) & ALIGNMENT_MASK;
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ stream = malloc(i - last_break);
+ memcpy(stream, vector->P + last_break, i - last_break);
+ }
+ aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+ last_break = i;
+ i = (rand() % test_len / 32) & ALIGNMENT_MASK;
+
+ }
+ aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break, vector->P + last_break,
+ vector->Plen - last_break);
+ if (gctx->in_length != vector->Plen)
+ printf("%lu, %lu\n", gctx->in_length, vector->Plen);
+ aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+
+ last_break = 0;
+ i = 0;
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < (vector->Plen)) {
+ if (rand() % (test_len / 64) == 0) {
+ if (i - last_break != 0) {
+ stream = malloc(i - last_break);
+ memcpy(stream, vector->C + last_break, i - last_break);
+ }
+ aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+
+ last_break = i;
+
+ }
+ if (rand() % 1024 != 0)
+ i++;
+
+ }
+ aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break, vector->C + last_break,
+ vector->Plen - last_break);
+ aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_128(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ free(pt_test);
+ free(ct_test);
+ free(o_ct_test);
+ free(rand_data);
+
+ return OK;
+}
+
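+// Streaming check with a deterministic chunking pattern: the first chunk ends at
+// `length` and each update then advances by (length - start) / breaks bytes.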
+int check_strm_vector2(struct gcm_key_data *gkey, struct gcm_context_data *gctx,
+ gcm_vector * vector, int length, int start, int breaks)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint8_t *stream = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ uint32_t last_break = 0;
+ int i = length;
+ uint8_t *rand_data = NULL;
+
+ rand_data = malloc(100);
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+	// Allocate space for the calculated plaintext and ciphertext
+ if (vector->Plen != 0) {
+ pt_test = malloc(vector->Plen);
+ ct_test = malloc(vector->Plen);
+ o_ct_test = malloc(vector->Plen);
+ if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+	// Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_enc_128(gkey, gctx, vector->C, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ stream = malloc(i - last_break);
+ memcpy(stream, vector->P + last_break, i - last_break);
+ }
+ aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ free(stream);
+ last_break = i;
+ i = i + (length - start) / breaks;
+
+ }
+ aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break, vector->P + last_break,
+ vector->Plen - last_break);
+ aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+
+ last_break = 0;
+ i = length;
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ stream = malloc(i - last_break);
+ memcpy(stream, vector->C + last_break, i - last_break);
+ }
+ aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ free(stream);
+ last_break = i;
+ i = i + (length - start) / breaks;
+
+ }
+
+ aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break, vector->C + last_break,
+ vector->Plen - last_break);
+ aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_128(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+	free(T_test);
+	free(o_T_test);
+	free(IV_c);
+	free(pt_test);
+	free(ct_test);
+	free(o_ct_test);
+	free(rand_data);
+
+ return OK;
+}
+
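+// Streaming check where each chunk is copied to the end of a freshly allocated
+// PAGE_LEN buffer, so a read past the supplied length is likely to be caught
+// (efence-style).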
+int check_strm_vector_efence(struct gcm_key_data *gkey, struct gcm_context_data *gctx,
+ gcm_vector * vector)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint8_t *stream = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ uint32_t last_break = 0;
+ int i = 1;
+ uint8_t *rand_data = NULL;
+ uint64_t length;
+
+ rand_data = malloc(100);
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+	// Allocate space for the calculated plaintext and ciphertext
+ if (vector->Plen != 0) {
+ pt_test = malloc(vector->Plen);
+ ct_test = malloc(vector->Plen);
+ o_ct_test = malloc(vector->Plen);
+ if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+	// Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < vector->Plen) {
+ if (rand() % 2000 == 0 || i - last_break > PAGE_LEN / 2) {
+ stream = malloc(PAGE_LEN);
+ i = i & ALIGNMENT_MASK;
+ memcpy(stream + PAGE_LEN - (i - last_break), vector->P + last_break,
+ i - last_break);
+ aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break,
+ stream + PAGE_LEN - (i - last_break),
+ i - last_break);
+ free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+ last_break = i;
+ }
+ if (rand() % 1024 != 0)
+ i++;
+
+ }
+ aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break, vector->P + last_break,
+ vector->Plen - last_break);
+ aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+
+ last_break = 0;
+ i = 0;
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < vector->Plen) {
+ if (rand() % 2000 == 0 || i - last_break > PAGE_LEN / 2) {
+ stream = malloc(PAGE_LEN);
+ i = i & ALIGNMENT_MASK;
+ memcpy(stream + PAGE_LEN - (i - last_break), vector->C + last_break,
+ i - last_break);
+ aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break,
+ stream + PAGE_LEN - (i - last_break),
+ i - last_break);
+ free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+
+ last_break = i;
+
+ }
+ if (rand() % 1024 != 0)
+ i++;
+
+ }
+ aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break, vector->C + last_break,
+ vector->Plen - last_break);
+ aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_128(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ free(pt_test);
+ free(ct_test);
+ free(o_ct_test);
+ free(rand_data);
+
+ return OK;
+}
+
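+// AES-GCM-256 counterpart of check_vector.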
+int check_256_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx,
+ gcm_vector * vector)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+	// Allocate space for the calculated plaintext and ciphertext
+ if (vector->Plen != 0) {
+ pt_test = malloc(vector->Plen);
+ ct_test = malloc(vector->Plen);
+ o_ct_test = malloc(vector->Plen);
+ if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+	// Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_256(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_enc_256(gkey, gctx, vector->C, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ openssl_aes_256_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aes_gcm_dec_256(gkey, gctx, vector->P, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(vector->T, T_test, vector->Tlen, "ISA-L decrypt vs encrypt tag (T)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L decrypted ISA-L plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_256(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L decrypted OpenSSL plain text (P)");
+ result =
+ openssl_aes_256_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ free(pt_test);
+ free(ct_test);
+ free(o_ct_test);
+
+ return OK;
+}
+
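+// AES-GCM-256 counterpart of check_strm_vector.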
+int check_256_strm_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx,
+ gcm_vector * vector, int test_len)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint8_t *stream = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ uint32_t last_break;
+ int i;
+ uint8_t *rand_data = NULL;
+ uint64_t length;
+
+ rand_data = malloc(100);
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+	// Allocate space for the calculated plaintext and ciphertext
+ if (vector->Plen != 0) {
+ pt_test = malloc(vector->Plen);
+ ct_test = malloc(vector->Plen);
+ o_ct_test = malloc(vector->Plen);
+ if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+	// Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_256(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_init_256(gkey, gctx, IV_c, vector->A, vector->Alen);
+
+ last_break = 0;
+ i = (rand() % test_len / 32) & ALIGNMENT_MASK;
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ stream = malloc(i - last_break);
+ memcpy(stream, vector->P + last_break, i - last_break);
+ }
+
+ aes_gcm_enc_256_update(gkey, gctx, vector->C + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+ last_break = i;
+ i += (rand() % test_len / 32) & ALIGNMENT_MASK;
+
+ }
+ aes_gcm_enc_256_update(gkey, gctx, vector->C + last_break, vector->P + last_break,
+ vector->Plen - last_break);
+ if (gctx->in_length != vector->Plen)
+ printf("%lu, %lu\n", gctx->in_length, vector->Plen);
+ aes_gcm_enc_256_finalize(gkey, gctx, vector->T, vector->Tlen);
+
+ openssl_aes_256_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+
+ last_break = 0;
+ i += (rand() % test_len / 32) & ALIGNMENT_MASK;
+ aes_gcm_init_256(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ stream = malloc(i - last_break);
+ memcpy(stream, vector->C + last_break, i - last_break);
+ }
+
+ aes_gcm_dec_256_update(gkey, gctx, vector->P + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+
+ last_break = i;
+ i += (rand() % test_len / 32) & ALIGNMENT_MASK;
+
+ }
+ aes_gcm_dec_256_update(gkey, gctx, vector->P + last_break, vector->C + last_break,
+ vector->Plen - last_break);
+ aes_gcm_dec_256_finalize(gkey, gctx, vector->T, vector->Tlen);
+
+ OK |= check_data(vector->T, T_test, vector->Tlen, "ISA-L decrypt vs encrypt tag (T)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L decrypted ISA-L plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_256(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L decrypted OpenSSL plain text (P)");
+ result =
+ openssl_aes_256_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ free(pt_test);
+ free(ct_test);
+ free(o_ct_test);
+
+ return OK;
+}
+
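+// Random efence-style streaming tests for AES-GCM-128 with tag lengths 8, 12 and 16.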
+int test_gcm_strm_efence(void)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+
+ gkey = malloc(sizeof(struct gcm_key_data));
+ gctx = malloc(sizeof(struct gcm_context_data));
+ if (NULL == gkey || NULL == gctx)
+ return 1;
+
+ printf("AES GCM random efence test vectors with random stream:");
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = (rand() % TEST_LEN);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % TEST_LEN);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ test.P = malloc(test.Plen + offset);
+ test.C = malloc(test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ test.P = malloc(16);
+ test.C = malloc(16);
+ }
+ test.K = malloc(GCM_128_KEY_LEN + offset);
+ test.Klen = GCM_128_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_strm_vector_efence(gkey, gctx, &test))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkey);
+ free(gctx);
+ return 0;
+}
+
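+// Random streaming tests for AES-GCM-128 over a key structure placed at a random
+// (possibly unaligned) offset.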
+int test_gcm_strm_combinations(int test_len)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ uint8_t *gkeytemp = NULL;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+
+ gkeytemp = malloc(sizeof(struct gcm_key_data) + 16);
+ gctx = malloc(sizeof(struct gcm_context_data));
+ gkey = (struct gcm_key_data *)(gkeytemp + rand() % 16);
+ if (NULL == gkey || NULL == gctx)
+ return 1;
+
+ printf("AES GCM random test vectors with random stream of average size %d:",
+ test_len / 64);
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = 0; // (rand() % test_len);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % test_len);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ test.P = malloc(test.Plen + offset);
+ test.C = malloc(test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ test.P = malloc(16);
+ test.C = malloc(16);
+ }
+ test.K = malloc(GCM_128_KEY_LEN + offset);
+ test.Klen = GCM_128_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_strm_vector(gkey, gctx, &test, test_len))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkeytemp);
+ free(gctx);
+ return 0;
+}
+
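+// Random one-shot tests for AES-GCM-128 over random plaintext, AAD, IV and key data.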
+int test_gcm_combinations(void)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+
+ gkey = malloc(sizeof(struct gcm_key_data));
+ gctx = malloc(sizeof(struct gcm_context_data));
+ if (NULL == gkey || NULL == gctx)
+ return 1;
+
+ printf("AES GCM random test vectors:");
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = (rand() % TEST_LEN);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % TEST_LEN);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ test.P = malloc(test.Plen + offset);
+ test.C = malloc(test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ test.P = malloc(16);
+ test.C = malloc(16);
+ }
+ test.K = malloc(GCM_128_KEY_LEN + offset);
+ test.Klen = GCM_128_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_vector(gkey, gctx, &test))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkey);
+ free(gctx);
+ return 0;
+}
+
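+// Random one-shot tests for AES-GCM-256.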
+int test_gcm256_combinations(void)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+
+ gkey = malloc(sizeof(struct gcm_key_data));
+ gctx = malloc(sizeof(struct gcm_context_data));
+ if (NULL == gkey || NULL == gctx)
+ return 1;
+
+ printf("AES-GCM-256 random test vectors:");
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = (rand() % TEST_LEN);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % TEST_LEN);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ test.P = malloc(test.Plen + offset);
+ test.C = malloc(test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ test.P = malloc(16);
+ test.C = malloc(16);
+ }
+ test.K = malloc(GCM_256_KEY_LEN + offset);
+ test.Klen = GCM_256_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+		// single Key length of 256bits/32bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_256_vector(gkey, gctx, &test))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkey);
+ free(gctx);
+ return 0;
+}
+
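+// Random streaming tests for AES-GCM-256 over a key structure placed at a random
+// (possibly unaligned) offset.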
+int test_gcm256_strm_combinations(int test_len)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ uint8_t *gkeytemp = NULL;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+
+ gkeytemp = malloc(sizeof(struct gcm_key_data) + 16);
+ gctx = malloc(sizeof(struct gcm_context_data));
+ gkey = (struct gcm_key_data *)(gkeytemp + rand() % 16);
+ if (NULL == gkey || NULL == gctx)
+ return 1;
+
+ printf("AES-GCM-256 random test vectors with random stream of average size %d:",
+ test_len / 64);
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = (rand() % test_len);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % test_len);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ test.P = malloc(test.Plen + offset);
+ test.C = malloc(test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ test.P = malloc(16);
+ test.C = malloc(16);
+ }
+ test.K = malloc(GCM_256_KEY_LEN + offset);
+ test.Klen = GCM_256_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+		// single Key length of 256bits/32bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_256_strm_vector(gkey, gctx, &test, test_len))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkeytemp);
+ free(gctx);
+ return 0;
+}
+
+//
+// place all data to end at a page boundary to check for read past the end
+//
+int test_gcm_efence(void)
+{
+ gcm_vector test;
+ int offset = 0;
+ gcm_key_size key_len;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+ uint8_t *P, *C, *K, *IV, *A, *T;
+
+ gkey = malloc(sizeof(struct gcm_key_data));
+ gctx = malloc(sizeof(struct gcm_context_data));
+ P = malloc(PAGE_LEN);
+ C = malloc(PAGE_LEN);
+ K = malloc(PAGE_LEN);
+ IV = malloc(PAGE_LEN);
+ A = malloc(PAGE_LEN);
+ T = malloc(PAGE_LEN);
+ if ((NULL == P) || (NULL == C) || (NULL == K) || (NULL == IV) || (NULL == A)
+ || (NULL == T) || (NULL == gkey) || (NULL == gctx)) {
+ printf("malloc of testsize:0x%x failed\n", PAGE_LEN);
+ return -1;
+ }
+
+ test.Plen = PAGE_LEN / 2;
+ // place buffers to end at page boundary
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.Alen = test.Plen;
+ test.Tlen = MAX_TAG_LEN;
+
+ printf("AES GCM efence test vectors:");
+ for (key_len = GCM_128_KEY_LEN; GCM_256_KEY_LEN >= key_len;
+ key_len += (GCM_256_KEY_LEN - GCM_128_KEY_LEN)) {
+ test.Klen = key_len;
+ for (offset = 0; MAX_UNALIGNED > offset; offset++) {
+ if (0 == (offset % 80))
+ printf("\n");
+ // move the start and size of the data block towards the end of the page
+ test.Plen = (PAGE_LEN / 2) - offset;
+ test.Alen = (PAGE_LEN / 4) - (offset * 4); //lengths must be a multiple of 4 bytes
+ //Place data at end of page
+ test.P = P + PAGE_LEN - test.Plen;
+ test.C = C + PAGE_LEN - test.Plen;
+ test.K = K + PAGE_LEN - test.Klen;
+ test.IV = IV + PAGE_LEN - test.IVlen;
+ test.A = A + PAGE_LEN - test.Alen;
+ test.T = T + PAGE_LEN - test.Tlen;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+ if (GCM_128_KEY_LEN == key_len) {
+ if (0 != check_vector(gkey, gctx, &test))
+ return 1;
+ } else {
+ if (0 != check_256_vector(gkey, gctx, &test))
+ return 1;
+ }
+ }
+ }
+ free(gkey);
+ free(gctx);
+ free(P);
+ free(C);
+ free(K);
+ free(IV);
+ free(A);
+ free(T);
+
+ printf("\n");
+ return 0;
+}
+
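+// Checks one published AES-GCM-128 test vector: one-shot and in-place encrypt/decrypt,
+// ISA-L<->ISA-L and ISA-L<->OpenSSL round trips.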
+int test_gcm128_std_vectors(gcm_vector const *vector)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("AES-GCM-128:\n");
+#endif
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vector->Plen);
+ if (ct_test == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+	// Allocate space for the calculated plaintext
+ pt_test = malloc(vector->Plen);
+ if (pt_test == NULL) {
+ fprintf(stderr, "Can't allocate plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen;
+	// Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, &gkey);
+#ifdef GCM_VECTORS_VERBOSE
+ dump_gcm_data(&gkey);
+#endif
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, pt_test, vector->Tlen,
+ vector->P, vector->Plen, ct_test);
+ OK |= check_data(pt_test, T_test, vector->Tlen, "OpenSSL vs ISA-L tag (T)");
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_128(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aes_gcm_dec_128(&gkey, &gctx, pt_test, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+	// test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_128(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ aes_gcm_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_128(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+ // OpenSSl enc -> ISA-L dec
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, T_test, vector->Tlen,
+ vector->P, vector->Plen, ct_test);
+ OK |=
+ check_data(ct_test, vector->C, vector->Plen, "OpenSSL encrypted cypher text (C)");
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_128(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "OpenSSL->ISA-L decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "OpenSSL->ISA-L decrypted tag (T)");
+ // ISA-L enc -> OpenSSl dec
+ aes_gcm_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, T_test, vector->Tlen,
+ ct_test, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "OSSL decrypted plain text (P)");
+ if (NULL != ct_test)
+ free(ct_test);
+ if (NULL != pt_test)
+ free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
+
+int test_gcm256_std_vectors(gcm_vector const *vector)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("AES-GCM-256:\n");
+#endif
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vector->Plen);
+ // Allocate space for the calculated plaintext
+ pt_test = malloc(vector->Plen);
+ if ((ct_test == NULL) || (pt_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen;
+ // Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_256(vector->K, &gkey);
+#ifdef GCM_VECTORS_VERBOSE
+ dump_gcm_data(&gkey);
+#endif
+
+ ////
+ // ISA-l Encrypt
+ ////
+ memset(ct_test, 0, vector->Plen);
+ aes_gcm_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ openssl_aes_256_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, pt_test, vector->Tlen,
+ vector->P, vector->Plen, ct_test);
+ OK |= check_data(ct_test, vector->C, vector->Tlen, "OpenSSL vs KA - cypher text (C)");
+ OK |= check_data(pt_test, vector->T, vector->Tlen, "OpenSSL vs KA - tag (T)");
+ OK |= check_data(pt_test, T_test, vector->Tlen, "OpenSSL vs ISA-L - tag (T)");
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_256(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aes_gcm_dec_256(&gkey, &gctx, pt_test, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+ // test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_256(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ aes_gcm_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_256(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+ // OpenSSl enc -> ISA-L dec
+ openssl_aes_256_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, T_test, vector->Tlen,
+ vector->P, vector->Plen, ct_test);
+ OK |=
+ check_data(ct_test, vector->C, vector->Plen, "OpenSSL encrypted cypher text (C)");
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_256(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "OpenSSL->ISA-L decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "OpenSSL->ISA-L decrypted tag (T)");
+ // ISA-L enc -> OpenSSl dec
+ aes_gcm_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ result =
+ openssl_aes_256_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, T_test, vector->Tlen,
+ ct_test, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "OSSL decrypted plain text (P)");
+ if (NULL != ct_test)
+ free(ct_test);
+ if (NULL != pt_test)
+ free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
+
+int test_gcm_std_vectors(void)
+{
+ int const vectors_cnt = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]);
+ int vect;
+ int OK = 0;
+
+ printf("AES-GCM standard test vectors:\n");
+ for (vect = 0; vect < vectors_cnt; vect++) {
+#ifdef GCM_VECTORS_VERBOSE
+ printf
+ ("Standard vector %d/%d Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ vect, vectors_cnt - 1, (int)gcm_vectors[vect].Klen,
+ (int)gcm_vectors[vect].IVlen, (int)gcm_vectors[vect].Plen,
+ (int)gcm_vectors[vect].Alen, (int)gcm_vectors[vect].Tlen);
+#else
+ printf(".");
+#endif
+
+ if (BITS_128 == gcm_vectors[vect].Klen) {
+ OK |= test_gcm128_std_vectors(&gcm_vectors[vect]);
+ } else {
+ OK |= test_gcm256_std_vectors(&gcm_vectors[vect]);
+ }
+ if (0 != OK)
+ return OK;
+ }
+ printf("\n");
+ return OK;
+}
+
+// The length of the data is set to length. The first stream is from 0 to start. After
+// that the data is broken into breaks chunks of equal size (except possibly the last
+// one due to divisibility).
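+// For example (per the description above), the call test_gcm_strm_combinations2(1024, 0, 1024)
+// made from main() below corresponds to a zero-length first stream followed by 1024 equal-sized
+// one-byte chunks covering the whole 1024-byte buffer.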
+int test_gcm_strm_combinations2(int length, int start, int breaks)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+
+ gkey = malloc(sizeof(struct gcm_key_data));
+ gctx = malloc(sizeof(struct gcm_context_data));
+ if (NULL == gkey || NULL == gctx)
+ return 1;
+
+ printf("AES GCM random test vectors of length %d and stream with %d breaks:", length,
+ breaks + 1);
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = length;
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % TEST_LEN);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ test.P = malloc(test.Plen + offset);
+ test.C = malloc(test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ test.P = malloc(16);
+ test.C = malloc(16);
+ }
+ test.K = malloc(GCM_128_KEY_LEN + offset);
+ test.Klen = GCM_128_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_strm_vector2(gkey, gctx, &test, length, start, breaks))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkey);
+ free(gctx);
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ int errors = 0;
+ int seed;
+
+ if (argc == 1)
+ seed = TEST_SEED;
+ else
+ seed = atoi(argv[1]);
+
+ srand(seed);
+ printf("SEED: %d\n", seed);
+
+ errors += test_gcm_std_vectors();
+ errors += test_gcm256_combinations();
+ errors += test_gcm_combinations();
+ errors += test_gcm_efence();
+ errors += test_gcm256_strm_combinations(TEST_LEN);
+ errors += test_gcm_strm_combinations(TEST_LEN);
+ errors += test_gcm256_strm_combinations(1024);
+ errors += test_gcm_strm_combinations(1024);
+ errors += test_gcm_strm_efence();
+ errors += test_gcm_strm_combinations2(1024, 0, 1024);
+
+ if (0 == errors)
+ printf("...Pass\n");
+ else
+ printf("...Fail\n");
+
+ return errors;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c
new file mode 100644
index 000000000..54581d6b6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c
@@ -0,0 +1,659 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h> // for memcmp
+#include <aes_gcm.h>
+#include "gcm_vectors.h"
+#include "types.h"
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name)
+{
+ int mismatch;
+ int OK = 0;
+
+ mismatch = memcmp(test, expected, len);
+ if (mismatch) {
+ OK = 1;
+ printf(" expected results don't match %s \t\t", data_name);
+ {
+ uint64_t a;
+ for (a = 0; a < len; a++) {
+ if (test[a] != expected[a]) {
+ printf(" '%x' != '%x' at %lx of %lx\n",
+ test[a], expected[a], a, len);
+ break;
+ }
+ }
+ }
+ }
+ return OK;
+}
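+
+// check_data() returns 0 on a match and 1 on a mismatch, so the tests below accumulate
+// failures with OK |= check_data(...) and treat any non-zero OK as a failed vector.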
+
+int test_gcm128_std_vectors(gcm_vector const *vector)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint64_t IV_alloc_len = 0;
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vector->Plen);
+ // Allocate space for the plain text
+ pt_test = malloc(vector->Plen);
+ if ((ct_test == NULL) || (pt_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen;
+ // Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, &gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_128(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L encrypted tag T(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aes_gcm_dec_128(&gkey, &gctx, pt_test, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+ // test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_128(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ aes_gcm_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_128(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+
+ memset(pt_test, 0, vector->Plen);
+
+ if (NULL != ct_test)
+ free(ct_test);
+ if (NULL != pt_test)
+ free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
+
+int test_gcm256_std_vectors(gcm_vector const *vector)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint64_t IV_alloc_len = 0;
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vector->Plen);
+ // Allocate space for the plain text
+ pt_test = malloc(vector->Plen);
+ if ((ct_test == NULL) || (pt_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen;
+ // Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_256(vector->K, &gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ memset(ct_test, 0, vector->Plen);
+ aes_gcm_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_256(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L encrypted tag T(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aes_gcm_dec_256(&gkey, &gctx, pt_test, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+ // test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_256(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ aes_gcm_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_256(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+
+ if (NULL != ct_test)
+ free(ct_test);
+ if (NULL != pt_test)
+ free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
+
+void aes_gcm_stream_enc_128(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context,
+ uint8_t * out,
+ uint8_t const *in,
+ uint64_t len,
+ uint8_t * iv,
+ uint8_t const *aad,
+ uint64_t aad_len, uint8_t * auth_tag, uint64_t auth_tag_len)
+{
+ aes_gcm_init_128(key_data, context, iv, aad, aad_len);
+ uint8_t test_sequence[] = { 1, 12, 22, 0, 1, 12, 16 }; //sum(test_sequence) > max_Plen in vectors
+ uint32_t i;
+ uint32_t offset = 0, dist;
+
+ for (i = 0; i < sizeof(test_sequence); i++) {
+ dist = test_sequence[i];
+ if (offset + dist > len)
+ break;
+ aes_gcm_enc_128_update(key_data, context, out + offset, in + offset, dist);
+ offset += dist;
+ }
+
+ aes_gcm_enc_128_update(key_data, context, out + offset, in + offset, len - offset);
+ aes_gcm_enc_128_finalize(key_data, context, auth_tag, auth_tag_len);
+}
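+
+// The wrapper above drives aes_gcm_enc_128_update() with the chunk sizes in test_sequence and
+// then finalizes; the stream tests below compare the result against the same standard vectors
+// as the one-shot aes_gcm_enc_128() path tested earlier in this file, so chunked updates must
+// yield an identical ciphertext and tag.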
+
+void aes_gcm_stream_dec_128(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context,
+ uint8_t * out,
+ uint8_t const *in,
+ uint64_t len,
+ uint8_t * iv,
+ uint8_t const *aad,
+ uint64_t aad_len, uint8_t * auth_tag, uint64_t auth_tag_len)
+{
+ aes_gcm_init_128(key_data, context, iv, aad, aad_len);
+ uint8_t test_sequence[] = { 1, 12, 22, 0, 1, 12, 16 }; //sum(test_sequence) > max_Plen in vectors
+ uint32_t i;
+ uint32_t offset = 0, dist;
+
+ for (i = 0; i < sizeof(test_sequence); i++) {
+ dist = test_sequence[i];
+ if (offset + dist > len)
+ break;
+ aes_gcm_dec_128_update(key_data, context, out + offset, in + offset, dist);
+ offset += dist;
+ }
+ aes_gcm_dec_128_update(key_data, context, out + offset, in + offset, len - offset);
+ aes_gcm_dec_128_finalize(key_data, context, auth_tag, auth_tag_len);
+
+}
+
+#if !defined(NT_LD) && !defined(NT_ST) && !defined(NT_LDST)
+int test_gcm128_std_stream_vectors(gcm_vector const *vector)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint64_t IV_alloc_len = 0;
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vector->Plen);
+ // Allocate space for the plain text
+ pt_test = malloc(vector->Plen);
+ if ((ct_test == NULL) || (pt_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen;
+ // Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ memset(gkey.expanded_keys, 0, sizeof(gkey.expanded_keys));
+ aes_gcm_pre_128(vector->K, &gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+
+ aes_gcm_stream_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_stream_enc_128(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L encrypted tag T(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aes_gcm_stream_dec_128(&gkey, &gctx, pt_test, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+ // test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_stream_dec_128(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ aes_gcm_stream_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_stream_dec_128(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+
+ memset(pt_test, 0, vector->Plen);
+
+ if (NULL != ct_test)
+ free(ct_test);
+ if (NULL != pt_test)
+ free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
+
+void aes_gcm_stream_enc_256(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context,
+ uint8_t * out,
+ uint8_t const *in,
+ uint64_t len,
+ uint8_t * iv,
+ uint8_t const *aad,
+ uint64_t aad_len, uint8_t * auth_tag, uint64_t auth_tag_len)
+{
+ aes_gcm_init_256(key_data, context, iv, aad, aad_len);
+ uint8_t test_sequence[] = { 1, 12, 22, 0, 1, 12, 16 }; //sum(test_sequence) > max_Plen in vectors
+ uint32_t i;
+ uint32_t offset = 0, dist;
+
+ for (i = 0; i < sizeof(test_sequence); i++) {
+ dist = test_sequence[i];
+ if (offset + dist > len)
+ break;
+ aes_gcm_enc_256_update(key_data, context, out + offset, in + offset, dist);
+ offset += dist;
+ }
+
+ aes_gcm_enc_256_update(key_data, context, out + offset, in + offset, len - offset);
+ aes_gcm_enc_256_finalize(key_data, context, auth_tag, auth_tag_len);
+
+}
+
+void aes_gcm_stream_dec_256(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context,
+ uint8_t * out,
+ uint8_t const *in,
+ uint64_t len,
+ uint8_t * iv,
+ uint8_t const *aad,
+ uint64_t aad_len, uint8_t * auth_tag, uint64_t auth_tag_len)
+{
+ aes_gcm_init_256(key_data, context, iv, aad, aad_len);
+ uint8_t test_sequence[] = { 1, 12, 22, 0, 1, 12, 16 }; //sum(test_sequence) > max_Plen in vectors
+ uint32_t i;
+ uint32_t offset = 0, dist;
+
+ for (i = 0; i < sizeof(test_sequence); i++) {
+ dist = test_sequence[i];
+ if (offset + dist > len)
+ break;
+ aes_gcm_dec_256_update(key_data, context, out + offset, in + offset, dist);
+ offset += dist;
+ }
+
+ aes_gcm_dec_256_update(key_data, context, out + offset, in + offset, len - offset);
+ aes_gcm_dec_256_finalize(key_data, context, auth_tag, auth_tag_len);
+
+}
+
+int test_gcm256_std_stream_vectors(gcm_vector const *vector)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint64_t IV_alloc_len = 0;
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vector->Plen);
+ // Allocate space for the plain text
+ pt_test = malloc(vector->Plen);
+ if ((ct_test == NULL) || (pt_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen;
+ // Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_256(vector->K, &gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ memset(ct_test, 0, vector->Plen);
+ aes_gcm_stream_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_stream_enc_256(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L encrypted tag T(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aes_gcm_stream_dec_256(&gkey, &gctx, pt_test, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+ // test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_stream_dec_256(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ aes_gcm_stream_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_stream_dec_256(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+
+ if (NULL != ct_test)
+ free(ct_test);
+ if (NULL != pt_test)
+ free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
+#endif
+
+int test_gcm_std_vectors(void)
+{
+ int const vectors_cnt = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]);
+ int vect;
+ int OK = 0;
+
+ printf("AES-GCM standard test vectors new api:\n");
+ for (vect = 0; (vect < vectors_cnt); vect++) {
+#ifdef DEBUG
+ printf("Standard vector new api %d/%d"
+ " Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ vect, vectors_cnt - 1, (int)gcm_vectors[vect].Klen,
+ (int)gcm_vectors[vect].IVlen, (int)gcm_vectors[vect].Plen,
+ (int)gcm_vectors[vect].Alen, (int)gcm_vectors[vect].Tlen);
+#else
+ printf(".");
+#endif
+ if (BITS_128 == gcm_vectors[vect].Klen)
+ OK |= test_gcm128_std_vectors(&gcm_vectors[vect]);
+ else
+ OK |= test_gcm256_std_vectors(&gcm_vectors[vect]);
+ if (0 != OK)
+ return OK;
+ }
+ printf("\n");
+ return OK;
+}
+
+#if !defined(NT_LD) && !defined(NT_ST) && !defined(NT_LDST)
+/**
+ * Stream API test with standard vectors
+ */
+int test_gcm_std_strm_vectors(void)
+{
+ int const vectors_cnt = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]);
+ int vect;
+ int OK = 0;
+
+ printf("AES-GCM standard test vectors stream api:\n");
+ for (vect = 0; (vect < vectors_cnt); vect++) {
+#ifdef DEBUG
+ printf("Standard vector stream api %d/%d"
+ " Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ vect, vectors_cnt - 1, (int)gcm_vectors[vect].Klen,
+ (int)gcm_vectors[vect].IVlen, (int)gcm_vectors[vect].Plen,
+ (int)gcm_vectors[vect].Alen, (int)gcm_vectors[vect].Tlen);
+#else
+ printf(".");
+#endif
+ if (BITS_128 == gcm_vectors[vect].Klen)
+ OK |= test_gcm128_std_stream_vectors(&gcm_vectors[vect]);
+ else
+ OK |= test_gcm256_std_stream_vectors(&gcm_vectors[vect]);
+ if (0 != OK)
+ return OK;
+ }
+ printf("\n");
+ return OK;
+}
+#endif
+
+int main(int argc, char **argv)
+{
+ int errors = 0;
+ int seed;
+
+ if (argc == 1)
+ seed = TEST_SEED;
+ else
+ seed = atoi(argv[1]);
+
+ srand(seed);
+ printf("SEED: %d\n", seed);
+
+ errors += test_gcm_std_vectors();
+#if !defined(NT_LD) && !defined(NT_ST) && !defined(NT_LDST)
+ errors += test_gcm_std_strm_vectors();
+#endif
+
+ if (0 == errors)
+ printf("...Pass\n");
+ else
+ printf("...Fail\n");
+
+ return errors;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_vaes_avx512.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_vaes_avx512.asm
new file mode 100644
index 000000000..dac7c5912
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_vaes_avx512.asm
@@ -0,0 +1,4296 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018-2019, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+; Tomasz Kantecki
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in the paper:
+; Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010.
+; The details of the implementation are explained in:
+; Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[3] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+; Must be a multiple of 4 bytes, as per the definition in the spec.
+; The code additionally supports any aadLen length.
+;
+; TLen:
+; From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; Throughout the code, one-tab and two-tab indentations are used: one tab for the GHASH part, two tabs for the AES part.
+;
+
+%include "reg_sizes.asm"
+%include "clear_regs.asm"
+%include "gcm_keys_vaes_avx512.asm"
+%include "gcm_defines.asm"
+%include "memcpy.asm"
+%include "aes_common.asm"
+
+%ifndef GCM128_MODE
+%ifndef GCM192_MODE
+%ifndef GCM256_MODE
+%error "No GCM mode selected for gcm_avx512.asm!"
+%endif
+%endif
+%endif
+
+%ifndef FUNCT_EXTENSION
+%define FUNCT_EXTENSION
+%endif
+
+;; Decide on AES-GCM key size to compile for
+%ifdef GCM128_MODE
+%define NROUNDS 9
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ vaes_avx512 %+ FUNCT_EXTENSION
+%endif
+
+%ifdef GCM192_MODE
+%define NROUNDS 11
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ vaes_avx512 %+ FUNCT_EXTENSION
+%endif
+
+%ifdef GCM256_MODE
+%define NROUNDS 13
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ vaes_avx512 %+ FUNCT_EXTENSION
+%endif
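+
+;; Note: AES-128/192/256 use 10/12/14 rounds in total; NROUNDS above is presumably the number
+;; of aesenc rounds performed before the final aesenclast, i.e. one less than the total.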
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+section .text
+default rel
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Stack frame definition
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE (10*16) ; space for 10 XMM registers
+ %define GP_STORAGE ((9*8) + 24) ; space for 9 GP registers + 24 bytes for 64 byte alignment
+%else
+ %define XMM_STORAGE 0
+ %define GP_STORAGE (8*8) ; space for 7 GP registers + 1 for alignment
+%endif
+%ifdef GCM_BIG_DATA
+%define LOCAL_STORAGE (128*16) ; space for up to 128 AES blocks
+%else
+%define LOCAL_STORAGE (48*16) ; space for up to 48 AES blocks
+%endif
+
+;;; sequence is (bottom-up): GP, XMM, local
+%define STACK_GP_OFFSET 0
+%define STACK_XMM_OFFSET (STACK_GP_OFFSET + GP_STORAGE)
+%define STACK_LOCAL_OFFSET (STACK_XMM_OFFSET + XMM_STORAGE)
+%define STACK_FRAME_SIZE (STACK_LOCAL_OFFSET + LOCAL_STORAGE)
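+;; e.g. for a non-windows build without GCM_BIG_DATA: GP_STORAGE = 64, XMM_STORAGE = 0 and
+;; LOCAL_STORAGE = 48*16 = 768, so STACK_FRAME_SIZE = 832 bytes.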
+
+;; for compatibility with stack argument definitions in gcm_defines.asm
+%define STACK_OFFSET 0
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; Horizontal XOR - 4 x 128bits xored together
+%macro VHPXORI4x128 2
+%define %%REG %1 ; [in/out] ZMM with 4x128bits to xor; 128bit output
+%define %%TMP %2 ; [clobbered] ZMM temporary register
+ vextracti64x4 YWORD(%%TMP), %%REG, 1
+ vpxorq YWORD(%%REG), YWORD(%%REG), YWORD(%%TMP)
+ vextracti32x4 XWORD(%%TMP), YWORD(%%REG), 1
+ vpxorq XWORD(%%REG), XWORD(%%REG), XWORD(%%TMP)
+%endmacro ; VHPXORI4x128
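+;;
+;; Example usage: "VHPXORI4x128 zmm1, zmm2" xors the four 128-bit lanes of zmm1 together and
+;; leaves the result in the low 128 bits (xmm1); zmm2 is clobbered.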
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; Horizontal XOR - 2 x 128bits xored together
+%macro VHPXORI2x128 2
+%define %%REG %1 ; [in/out] YMM/ZMM with 2x128bits to xor; 128bit output
+%define %%TMP %2 ; [clobbered] XMM/YMM/ZMM temporary register
+ vextracti32x4 XWORD(%%TMP), %%REG, 1
+ vpxorq XWORD(%%REG), XWORD(%%REG), XWORD(%%TMP)
+%endmacro ; VHPXORI2x128
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; schoolbook multiply - 1st step
+%macro VCLMUL_STEP1 6-7
+%define %%KP %1 ; [in] key pointer
+%define %%HI %2 ; [in] previous blocks 4 to 7
+%define %%TMP %3 ; [clobbered] ZMM/YMM/XMM temporary
+%define %%TH %4 ; [out] high product
+%define %%TM %5 ; [out] medium product
+%define %%TL %6 ; [out] low product
+%define %%HKEY %7 ; [in/optional] hash key for multiplication
+
+%if %0 == 6
+ vmovdqu64 %%TMP, [%%KP + HashKey_4]
+%else
+ vmovdqa64 %%TMP, %%HKEY
+%endif
+ vpclmulqdq %%TH, %%HI, %%TMP, 0x11 ; %%T5 = a1*b1
+ vpclmulqdq %%TL, %%HI, %%TMP, 0x00 ; %%T7 = a0*b0
+ vpclmulqdq %%TM, %%HI, %%TMP, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%TMP, %%HI, %%TMP, 0x10 ; %%T4 = a0*b1
+ vpxorq %%TM, %%TM, %%TMP ; [%%TH : %%TM : %%TL]
+%endmacro ; VCLMUL_STEP1
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; schoolbook multiply - 2nd step
+%macro VCLMUL_STEP2 9-11
+%define %%KP %1 ; [in] key pointer
+%define %%HI %2 ; [out] ghash high 128 bits
+%define %%LO %3 ; [in/out] cipher text blocks 0-3 (in); ghash low 128 bits (out)
+%define %%TMP0 %4 ; [clobbered] ZMM/YMM/XMM temporary
+%define %%TMP1 %5 ; [clobbered] ZMM/YMM/XMM temporary
+%define %%TMP2 %6 ; [clobbered] ZMM/YMM/XMM temporary
+%define %%TH %7 ; [in] high product
+%define %%TM %8 ; [in] medium product
+%define %%TL %9 ; [in] low product
+%define %%HKEY %10 ; [in/optional] hash key for multiplication
+%define %%HXOR %11 ; [in/optional] type of horizontal xor (4 - 4x128; 2 - 2x128; 1 - none)
+
+%if %0 == 9
+ vmovdqu64 %%TMP0, [%%KP + HashKey_8]
+%else
+ vmovdqa64 %%TMP0, %%HKEY
+%endif
+ vpclmulqdq %%TMP1, %%LO, %%TMP0, 0x10 ; %%TMP1 = a0*b1
+ vpclmulqdq %%TMP2, %%LO, %%TMP0, 0x11 ; %%TMP2 = a1*b1
+ vpxorq %%TH, %%TH, %%TMP2
+ vpclmulqdq %%TMP2, %%LO, %%TMP0, 0x00 ; %%TMP2 = a0*b0
+ vpxorq %%TL, %%TL, %%TMP2
+ vpclmulqdq %%TMP0, %%LO, %%TMP0, 0x01 ; %%TMP0 = a1*b0
+ vpternlogq %%TM, %%TMP1, %%TMP0, 0x96 ; %%TM = TM xor TMP1 xor TMP0
+
+ ;; finish multiplications
+ vpsrldq %%TMP2, %%TM, 8
+ vpxorq %%HI, %%TH, %%TMP2
+ vpslldq %%TMP2, %%TM, 8
+ vpxorq %%LO, %%TL, %%TMP2
+
+ ;; xor 128bit words horizontally and compute [(X8*H1) + (X7*H2) + ... + ((X1+Y0)*H8)]
+ ;; note: (X1+Y0) handled elsewhere
+%if %0 < 11
+ VHPXORI4x128 %%HI, %%TMP2
+ VHPXORI4x128 %%LO, %%TMP1
+%else
+%if %%HXOR == 4
+ VHPXORI4x128 %%HI, %%TMP2
+ VHPXORI4x128 %%LO, %%TMP1
+%elif %%HXOR == 2
+ VHPXORI2x128 %%HI, %%TMP2
+ VHPXORI2x128 %%LO, %%TMP1
+%endif ; HXOR
+ ;; for HXOR == 1 there is nothing to be done
+%endif ; !(%0 < 11)
+ ;; HIx holds top 128 bits
+ ;; LOx holds low 128 bits
+ ;; - further reductions to follow
+%endmacro ; VCLMUL_STEP2
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; AVX512 reduction macro
+%macro VCLMUL_REDUCE 6
+%define %%OUT %1 ; [out] zmm/ymm/xmm: result (must not be %%TMP1 or %%HI128)
+%define %%POLY %2 ; [in] zmm/ymm/xmm: polynomial
+%define %%HI128 %3 ; [in] zmm/ymm/xmm: high 128b of hash to reduce
+%define %%LO128 %4 ; [in] zmm/ymm/xmm: low 128b of hash to reduce
+%define %%TMP0 %5 ; [in] zmm/ymm/xmm: temporary register
+%define %%TMP1 %6 ; [in] zmm/ymm/xmm: temporary register
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; first phase of the reduction
+ vpclmulqdq %%TMP0, %%POLY, %%LO128, 0x01
+ vpslldq %%TMP0, %%TMP0, 8 ; shift-L 2 DWs
+ vpxorq %%TMP0, %%LO128, %%TMP0 ; first phase of the reduction complete
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; second phase of the reduction
+ vpclmulqdq %%TMP1, %%POLY, %%TMP0, 0x00
+ vpsrldq %%TMP1, %%TMP1, 4 ; shift-R only 1-DW to obtain 2-DWs shift-R
+
+ vpclmulqdq %%OUT, %%POLY, %%TMP0, 0x10
+ vpslldq %%OUT, %%OUT, 4 ; shift-L 1-DW to obtain result with no shifts
+
+ vpternlogq %%OUT, %%TMP1, %%HI128, 0x96 ; OUT/GHASH = OUT xor TMP1 xor HI128
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endmacro
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; schoolbook multiply (1 to 8 blocks) - 1st step
+%macro VCLMUL_1_TO_8_STEP1 8
+%define %%KP %1 ; [in] key pointer
+%define %%HI %2 ; [in] ZMM ciphered blocks 4 to 7
+%define %%TMP1 %3 ; [clobbered] ZMM temporary
+%define %%TMP2 %4 ; [clobbered] ZMM temporary
+%define %%TH %5 ; [out] ZMM high product
+%define %%TM %6 ; [out] ZMM medium product
+%define %%TL %7 ; [out] ZMM low product
+%define %%NBLOCKS %8 ; [in] number of blocks to ghash (0 to 8)
+
+%if %%NBLOCKS == 8
+ VCLMUL_STEP1 %%KP, %%HI, %%TMP1, %%TH, %%TM, %%TL
+%elif %%NBLOCKS == 7
+ vmovdqu64 %%TMP2, [%%KP + HashKey_3]
+ vmovdqa64 %%TMP1, [rel mask_out_top_block]
+ vpandq %%TMP2, %%TMP1
+ vpandq %%HI, %%TMP1
+ VCLMUL_STEP1 NULL, %%HI, %%TMP1, %%TH, %%TM, %%TL, %%TMP2
+%elif %%NBLOCKS == 6
+ vmovdqu64 YWORD(%%TMP2), [%%KP + HashKey_2]
+ VCLMUL_STEP1 NULL, YWORD(%%HI), YWORD(%%TMP1), \
+ YWORD(%%TH), YWORD(%%TM), YWORD(%%TL), YWORD(%%TMP2)
+%elif %%NBLOCKS == 5
+ vmovdqu64 XWORD(%%TMP2), [%%KP + HashKey_1]
+ VCLMUL_STEP1 NULL, XWORD(%%HI), XWORD(%%TMP1), \
+ XWORD(%%TH), XWORD(%%TM), XWORD(%%TL), XWORD(%%TMP2)
+%else
+ vpxorq %%TH, %%TH
+ vpxorq %%TM, %%TM
+ vpxorq %%TL, %%TL
+%endif
+%endmacro ; VCLMUL_1_TO_8_STEP1
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; schoolbook multiply (1 to 8 blocks) - 2nd step
+%macro VCLMUL_1_TO_8_STEP2 10
+%define %%KP %1 ; [in] key pointer
+%define %%HI %2 ; [out] ZMM ghash high 128bits
+%define %%LO %3 ; [in/out] ZMM ciphered blocks 0 to 3 (in); ghash low 128bits (out)
+%define %%TMP0 %4 ; [clobbered] ZMM temporary
+%define %%TMP1 %5 ; [clobbered] ZMM temporary
+%define %%TMP2 %6 ; [clobbered] ZMM temporary
+%define %%TH %7 ; [in/clobbered] ZMM high sum
+%define %%TM %8 ; [in/clobbered] ZMM medium sum
+%define %%TL %9 ; [in/clobbered] ZMM low sum
+%define %%NBLOCKS %10 ; [in] number of blocks to ghash (0 to 8)
+
+%if %%NBLOCKS == 8
+ VCLMUL_STEP2 %%KP, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL
+%elif %%NBLOCKS == 7
+ vmovdqu64 %%TMP2, [%%KP + HashKey_7]
+ VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4
+%elif %%NBLOCKS == 6
+ vmovdqu64 %%TMP2, [%%KP + HashKey_6]
+ VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4
+%elif %%NBLOCKS == 5
+ vmovdqu64 %%TMP2, [%%KP + HashKey_5]
+ VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4
+%elif %%NBLOCKS == 4
+ vmovdqu64 %%TMP2, [%%KP + HashKey_4]
+ VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4
+%elif %%NBLOCKS == 3
+ vmovdqu64 %%TMP2, [%%KP + HashKey_3]
+ vmovdqa64 %%TMP1, [rel mask_out_top_block]
+ vpandq %%TMP2, %%TMP1
+ vpandq %%LO, %%TMP1
+ VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4
+%elif %%NBLOCKS == 2
+ vmovdqu64 YWORD(%%TMP2), [%%KP + HashKey_2]
+ VCLMUL_STEP2 NULL, YWORD(%%HI), YWORD(%%LO), \
+ YWORD(%%TMP0), YWORD(%%TMP1), YWORD(%%TMP2), \
+ YWORD(%%TH), YWORD(%%TM), YWORD(%%TL), YWORD(%%TMP2), 2
+%elif %%NBLOCKS == 1
+ vmovdqu64 XWORD(%%TMP2), [%%KP + HashKey_1]
+ VCLMUL_STEP2 NULL, XWORD(%%HI), XWORD(%%LO), \
+ XWORD(%%TMP0), XWORD(%%TMP1), XWORD(%%TMP2), \
+ XWORD(%%TH), XWORD(%%TM), XWORD(%%TL), XWORD(%%TMP2), 1
+%else
+ vpxorq %%HI, %%HI
+ vpxorq %%LO, %%LO
+%endif
+%endmacro ; VCLMUL_1_TO_8_STEP2
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; GHASH 1 to 16 blocks of cipher text
+;;; - performs reduction at the end
+;;; - can take intermediate GHASH sums as input
+%macro GHASH_1_TO_16 20
+%define %%KP %1 ; [in] pointer to expanded keys
+%define %%GHASH %2 ; [out] ghash output
+%define %%T1 %3 ; [clobbered] temporary ZMM
+%define %%T2 %4 ; [clobbered] temporary ZMM
+%define %%T3 %5 ; [clobbered] temporary ZMM
+%define %%T4 %6 ; [clobbered] temporary ZMM
+%define %%T5 %7 ; [clobbered] temporary ZMM
+%define %%T6 %8 ; [clobbered] temporary ZMM
+%define %%T7 %9 ; [clobbered] temporary ZMM
+%define %%T8 %10 ; [clobbered] temporary ZMM
+%define %%T9 %11 ; [clobbered] temporary ZMM
+%define %%GH %12 ; [in/clobbered] ghash sum (high) or "no_zmm"
+%define %%GL %13 ; [in/clobbered] ghash sum (low) or "no_zmm"
+%define %%GM %14 ; [in/clobbered] ghash sum (medium) or "no_zmm"
+%define %%AAD_HASH_IN %15 ; [in] input hash value
+%define %%CIPHER_IN0 %16 ; [in] ZMM with cipher text blocks 0-3
+%define %%CIPHER_IN1 %17 ; [in] ZMM with cipher text blocks 4-7
+%define %%CIPHER_IN2 %18 ; [in] ZMM with cipher text blocks 8-11
+%define %%CIPHER_IN3 %19 ; [in] ZMM with cipher text blocks 12-15
+%define %%NUM_BLOCKS %20 ; [in] numerical value, number of blocks
+
+%define %%T0H %%T1
+%define %%T0L %%T2
+%define %%T0M1 %%T3
+%define %%T0M2 %%T4
+
+%define %%T1H %%T5
+%define %%T1L %%T6
+%define %%T1M1 %%T7
+%define %%T1M2 %%T8
+
+%define %%HK %%T9
+
+%assign hashk HashKey_ %+ %%NUM_BLOCKS
+%assign reg_idx 0
+%assign blocks_left %%NUM_BLOCKS
+
+ vpxorq %%CIPHER_IN0, %%CIPHER_IN0, %%AAD_HASH_IN
+
+%assign first_result 1
+
+%ifnidn %%GH, no_zmm
+%ifnidn %%GM, no_zmm
+%ifnidn %%GL, no_zmm
+ ;; GHASH sums passed in to be updated and
+ ;; reduced at the end
+ vmovdqa64 %%T0H, %%GH
+ vmovdqa64 %%T0L, %%GL
+ vmovdqa64 %%T0M1, %%GM
+ vpxorq %%T0M2, %%T0M2
+%assign first_result 0
+%endif
+%endif
+%endif
+
+%rep (blocks_left / 4)
+%xdefine %%REG_IN %%CIPHER_IN %+ reg_idx
+ vmovdqu64 %%HK, [%%KP + hashk]
+%if first_result == 1
+ vpclmulqdq %%T0H, %%REG_IN, %%HK, 0x11 ; H = a1*b1
+ vpclmulqdq %%T0L, %%REG_IN, %%HK, 0x00 ; L = a0*b0
+ vpclmulqdq %%T0M1, %%REG_IN, %%HK, 0x01 ; M1 = a1*b0
+ vpclmulqdq %%T0M2, %%REG_IN, %%HK, 0x10 ; TM2 = a0*b1
+%assign first_result 0
+%else
+ vpclmulqdq %%T1H, %%REG_IN, %%HK, 0x11 ; H = a1*b1
+ vpclmulqdq %%T1L, %%REG_IN, %%HK, 0x00 ; L = a0*b0
+ vpclmulqdq %%T1M1, %%REG_IN, %%HK, 0x01 ; M1 = a1*b0
+ vpclmulqdq %%T1M2, %%REG_IN, %%HK, 0x10 ; M2 = a0*b1
+ vpxorq %%T0H, %%T0H, %%T1H
+ vpxorq %%T0L, %%T0L, %%T1L
+ vpxorq %%T0M1, %%T0M1, %%T1M1
+ vpxorq %%T0M2, %%T0M2, %%T1M2
+%endif
+%undef %%REG_IN
+%assign reg_idx (reg_idx + 1)
+%assign hashk (hashk + 64)
+%assign blocks_left (blocks_left - 4)
+%endrep
+
+%if blocks_left > 0
+;; There are 1, 2 or 3 blocks left to process.
+;; It may also be that they are the only blocks to process.
+
+%xdefine %%REG_IN %%CIPHER_IN %+ reg_idx
+
+%if first_result == 1
+;; Case where %%NUM_BLOCKS = 1, 2 or 3
+%xdefine %%OUT_H %%T0H
+%xdefine %%OUT_L %%T0L
+%xdefine %%OUT_M1 %%T0M1
+%xdefine %%OUT_M2 %%T0M2
+%else
+%xdefine %%OUT_H %%T1H
+%xdefine %%OUT_L %%T1L
+%xdefine %%OUT_M1 %%T1M1
+%xdefine %%OUT_M2 %%T1M2
+%endif
+
+%if blocks_left == 1
+ vmovdqu64 XWORD(%%HK), [%%KP + hashk]
+ vpclmulqdq XWORD(%%OUT_H), XWORD(%%REG_IN), XWORD(%%HK), 0x11 ; %%TH = a1*b1
+ vpclmulqdq XWORD(%%OUT_L), XWORD(%%REG_IN), XWORD(%%HK), 0x00 ; %%TL = a0*b0
+ vpclmulqdq XWORD(%%OUT_M1), XWORD(%%REG_IN), XWORD(%%HK), 0x01 ; %%TM1 = a1*b0
+ vpclmulqdq XWORD(%%OUT_M2), XWORD(%%REG_IN), XWORD(%%HK), 0x10 ; %%TM2 = a0*b1
+%elif blocks_left == 2
+ vmovdqu64 YWORD(%%HK), [%%KP + hashk]
+ vpclmulqdq YWORD(%%OUT_H), YWORD(%%REG_IN), YWORD(%%HK), 0x11 ; %%TH = a1*b1
+ vpclmulqdq YWORD(%%OUT_L), YWORD(%%REG_IN), YWORD(%%HK), 0x00 ; %%TL = a0*b0
+ vpclmulqdq YWORD(%%OUT_M1), YWORD(%%REG_IN), YWORD(%%HK), 0x01 ; %%TM1 = a1*b0
+ vpclmulqdq YWORD(%%OUT_M2), YWORD(%%REG_IN), YWORD(%%HK), 0x10 ; %%TM2 = a0*b1
+%else ; blocks_left == 3
+ vmovdqu64 YWORD(%%HK), [%%KP + hashk]
+ vinserti64x2 %%HK, [%%KP + hashk + 32], 2
+ vpclmulqdq %%OUT_H, %%REG_IN, %%HK, 0x11 ; %%TH = a1*b1
+ vpclmulqdq %%OUT_L, %%REG_IN, %%HK, 0x00 ; %%TL = a0*b0
+ vpclmulqdq %%OUT_M1, %%REG_IN, %%HK, 0x01 ; %%TM1 = a1*b0
+ vpclmulqdq %%OUT_M2, %%REG_IN, %%HK, 0x10 ; %%TM2 = a0*b1
+%endif ; blocks_left
+
+%undef %%REG_IN
+%undef %%OUT_H
+%undef %%OUT_L
+%undef %%OUT_M1
+%undef %%OUT_M2
+
+%if first_result != 1
+ vpxorq %%T0H, %%T0H, %%T1H
+ vpxorq %%T0L, %%T0L, %%T1L
+ vpxorq %%T0M1, %%T0M1, %%T1M1
+ vpxorq %%T0M2, %%T0M2, %%T1M2
+%endif
+
+%endif ; blocks_left > 0
+
+ ;; integrate TM into TH and TL
+ vpxorq %%T0M1, %%T0M1, %%T0M2
+ vpsrldq %%T1M1, %%T0M1, 8
+ vpslldq %%T1M2, %%T0M1, 8
+ vpxorq %%T0H, %%T0H, %%T1M1
+ vpxorq %%T0L, %%T0L, %%T1M2
+
+ ;; add TH and TL 128-bit words horizontally
+ VHPXORI4x128 %%T0H, %%T1M1
+ VHPXORI4x128 %%T0L, %%T1M2
+
+ ;; reduction
+ vmovdqa64 XWORD(%%HK), [rel POLY2]
+ VCLMUL_REDUCE XWORD(%%GHASH), XWORD(%%HK), \
+ XWORD(%%T0H), XWORD(%%T0L), XWORD(%%T0M1), XWORD(%%T0M2)
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+;;; Input: A and B (128-bits each, bit-reflected)
+;;; Output: C = A*B*x mod poly, (i.e. >>1 )
+;;; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+;;; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes
+%define %%HK %2 ; 16 Bytes
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%T2, %%GH, %%HK, 0x00 ; %%T2 = a0*b0
+ vpclmulqdq %%T3, %%GH, %%HK, 0x01 ; %%T3 = a1*b0
+ vpclmulqdq %%GH, %%GH, %%HK, 0x10 ; %%GH = a0*b1
+ vpxorq %%GH, %%GH, %%T3
+
+
+ vpsrldq %%T3, %%GH, 8 ; shift-R %%GH 2 DWs
+ vpslldq %%GH, %%GH, 8 ; shift-L %%GH 2 DWs
+
+ vpxorq %%T1, %%T1, %%T3
+ vpxorq %%GH, %%GH, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu64 %%T3, [rel POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%GH, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs
+
+ vpxorq %%GH, %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%GH, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R only 1-DW to obtain 2-DWs shift-R
+
+ vpclmulqdq %%GH, %%T3, %%GH, 0x10
+ vpslldq %%GH, %%GH, 4 ; Shift-L 1-DW to obtain result with no shifts
+
+ ; second phase of the reduction complete, the result is in %%GH
+ vpternlogq %%GH, %%T1, %%T2, 0x96 ; GH = GH xor T1 xor T2
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx512
+;;; functions, but are kept to allow users to switch cpu architectures between calls
+;;; of pre, init, update, and finalize.
+%macro PRECOMPUTE 8
+%define %%GDATA %1
+%define %%HK %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+%define %%T6 %8
+
+ vmovdqa %%T5, %%HK
+
+ ;; GHASH keys 2 to 48 or 128
+%ifdef GCM_BIG_DATA
+%assign max_hkey_idx 128
+%else
+%assign max_hkey_idx 48
+%endif
+
+%assign i 2
+%rep (max_hkey_idx - 1)
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^i<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_ %+ i], %%T5 ; [HashKey_i] = %%T5
+%assign i (i + 1)
+%endrep
+
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; READ_SMALL_DATA_INPUT
+;;; Packs an xmm register with data when the data input is less than or equal to 16 bytes
+;;; The output register is zeroed when the data has length 0
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 5
+%define %%OUTPUT %1 ; [out] xmm register
+%define %%INPUT %2 ; [in] buffer pointer to read from
+%define %%LENGTH %3 ; [in] number of bytes to read
+%define %%TMP1 %4 ; [clobbered]
+%define %%MASK %5 ; [out] k1 to k7 register to store the partial block mask
+
+ cmp %%LENGTH, 16
+ jge %%_read_small_data_ge16
+ lea %%TMP1, [rel byte_len_to_mask_table]
+%ifidn __OUTPUT_FORMAT__, win64
+ add %%TMP1, %%LENGTH
+ add %%TMP1, %%LENGTH
+ kmovw %%MASK, [%%TMP1]
+%else
+ kmovw %%MASK, [%%TMP1 + %%LENGTH*2]
+%endif
+ vmovdqu8 %%OUTPUT{%%MASK}{z}, [%%INPUT]
+ jmp %%_read_small_data_end
+%%_read_small_data_ge16:
+ VX512LDR %%OUTPUT, [%%INPUT]
+ mov %%TMP1, 0xffff
+ kmovq %%MASK, %%TMP1
+%%_read_small_data_end:
+%endmacro ; READ_SMALL_DATA_INPUT
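+
+;;; byte_len_to_mask_table is indexed with a 2-byte stride; entry LENGTH is
+;;; assumed to hold a 16-bit mask with the LENGTH least significant bits set,
+;;; so the masked vmovdqu8 above loads exactly LENGTH bytes and zeroes the
+;;; rest of the xmm register. Illustrative call (arbitrary register choices),
+;;; reading r12 bytes from the buffer at rsi into xmm0 with r13 as scratch:
+;;;     READ_SMALL_DATA_INPUT   xmm0, rsi, r12, r13, k1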
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 18
+%define %%A_IN %1 ; [in] AAD text pointer
+%define %%A_LEN %2 ; [in] AAD length
+%define %%AAD_HASH %3 ; [out] xmm ghash value
+%define %%GDATA_KEY %4 ; [in] pointer to keys
+%define %%ZT0 %5 ; [clobbered] ZMM register
+%define %%ZT1 %6 ; [clobbered] ZMM register
+%define %%ZT2 %7 ; [clobbered] ZMM register
+%define %%ZT3 %8 ; [clobbered] ZMM register
+%define %%ZT4 %9 ; [clobbered] ZMM register
+%define %%ZT5 %10 ; [clobbered] ZMM register
+%define %%ZT6 %11 ; [clobbered] ZMM register
+%define %%ZT7 %12 ; [clobbered] ZMM register
+%define %%ZT8 %13 ; [clobbered] ZMM register
+%define %%ZT9 %14 ; [clobbered] ZMM register
+%define %%T1 %15 ; [clobbered] GP register
+%define %%T2 %16 ; [clobbered] GP register
+%define %%T3 %17 ; [clobbered] GP register
+%define %%MASKREG %18 ; [clobbered] mask register
+
+%define %%SHFMSK %%ZT9
+%define %%POLY %%ZT8
+%define %%TH %%ZT7
+%define %%TM %%ZT6
+%define %%TL %%ZT5
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ vpxorq %%AAD_HASH, %%AAD_HASH
+
+ vmovdqa64 %%SHFMSK, [rel SHUF_MASK]
+ vmovdqa64 %%POLY, [rel POLY2]
+
+%%_get_AAD_loop128:
+ cmp %%T2, 128
+ jl %%_exit_AAD_loop128
+
+ vmovdqu64 %%ZT2, [%%T1 + 64*0] ; LO blocks (0-3)
+ vmovdqu64 %%ZT1, [%%T1 + 64*1] ; HI blocks (4-7)
+ vpshufb %%ZT2, %%SHFMSK
+ vpshufb %%ZT1, %%SHFMSK
+
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+
+ VCLMUL_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%TH, %%TM, %%TL
+ VCLMUL_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, %%ZT0, %%ZT3, %%ZT4, %%TH, %%TM, %%TL
+
+ ;; result in %%ZT1(H):%%ZT2(L)
+ ;; reduce and put the result in AAD_HASH
+ VCLMUL_REDUCE %%AAD_HASH, XWORD(%%POLY), XWORD(%%ZT1), XWORD(%%ZT2), \
+ XWORD(%%ZT0), XWORD(%%ZT3)
+
+ sub %%T2, 128
+ je %%_CALC_AAD_done
+
+ add %%T1, 128
+ jmp %%_get_AAD_loop128
+
+%%_exit_AAD_loop128:
+ or %%T2, %%T2
+ jz %%_CALC_AAD_done
+
+ ;; prep mask source address
+ lea %%T3, [rel byte64_len_to_mask_table]
+ lea %%T3, [%%T3 + %%T2*8]
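+        ;; byte64_len_to_mask_table entries are 8 bytes wide; entry N is
+        ;; assumed to hold a 64-bit byte mask with the N least significant
+        ;; bits set. For the 5 to 8 block cases below, "sub %%T3, (64 * 8)"
+        ;; re-biases the pointer so the mask covers only the bytes of the
+        ;; second 64-byte load.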
+
+ ;; calculate number of blocks to ghash (including partial bytes)
+ add %%T2, 15
+ and %%T2, -16 ; 1 to 8 blocks possible here
+ shr %%T2, 4
+ cmp %%T2, 7
+ je %%_AAD_blocks_7
+ cmp %%T2, 6
+ je %%_AAD_blocks_6
+ cmp %%T2, 5
+ je %%_AAD_blocks_5
+ cmp %%T2, 4
+ je %%_AAD_blocks_4
+ cmp %%T2, 3
+ je %%_AAD_blocks_3
+ cmp %%T2, 2
+ je %%_AAD_blocks_2
+ cmp %%T2, 1
+ je %%_AAD_blocks_1
+ ;; fall through for 8 blocks
+
+        ;; The flow of each of these cases is identical:
+        ;; - load the blocks of text
+        ;; - shuffle the loaded blocks
+        ;; - xor the current hash value into block 0
+        ;; - perform the multiplications with the ghash keys
+        ;; - jump to the reduction code
+%%_AAD_blocks_8:
+ sub %%T3, (64 * 8)
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2, [%%T1 + 64*0]
+ vmovdqu8 %%ZT1{%%MASKREG}{z}, [%%T1 + 64*1]
+ vpshufb %%ZT2, %%SHFMSK
+ vpshufb %%ZT1, %%SHFMSK
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) ; xor in current ghash
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 8
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 8
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_7:
+ sub %%T3, (64 * 8)
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2, [%%T1 + 64*0]
+ vmovdqu8 %%ZT1{%%MASKREG}{z}, [%%T1 + 64*1]
+ vpshufb %%ZT2, %%SHFMSK
+ vpshufb %%ZT1, %%SHFMSK
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) ; xor in current ghash
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 7
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 7
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_6:
+ sub %%T3, (64 * 8)
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2, [%%T1 + 64*0]
+ vmovdqu8 YWORD(%%ZT1){%%MASKREG}{z}, [%%T1 + 64*1]
+ vpshufb %%ZT2, %%SHFMSK
+ vpshufb YWORD(%%ZT1), YWORD(%%SHFMSK)
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 6
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 6
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_5:
+ sub %%T3, (64 * 8)
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2, [%%T1 + 64*0]
+ vmovdqu8 XWORD(%%ZT1){%%MASKREG}{z}, [%%T1 + 64*1]
+ vpshufb %%ZT2, %%SHFMSK
+ vpshufb XWORD(%%ZT1), XWORD(%%SHFMSK)
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 5
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 5
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_4:
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2{%%MASKREG}{z}, [%%T1 + 64*0]
+ vpshufb %%ZT2, %%SHFMSK
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 4
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 4
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_3:
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2{%%MASKREG}{z}, [%%T1 + 64*0]
+ vpshufb %%ZT2, %%SHFMSK
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 3
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 3
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_2:
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 YWORD(%%ZT2){%%MASKREG}{z}, [%%T1 + 64*0]
+ vpshufb YWORD(%%ZT2), YWORD(%%SHFMSK)
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 2
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 2
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_1:
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 XWORD(%%ZT2){%%MASKREG}{z}, [%%T1 + 64*0]
+ vpshufb XWORD(%%ZT2), XWORD(%%SHFMSK)
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 1
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 1
+
+%%_AAD_blocks_done:
+ ;; Multiplications have been done. Do the reduction now
+ VCLMUL_REDUCE %%AAD_HASH, XWORD(%%POLY), XWORD(%%ZT1), XWORD(%%ZT2), \
+ XWORD(%%ZT0), XWORD(%%ZT3)
+%%_CALC_AAD_done:
+ ;; result in AAD_HASH
+
+%endmacro ; CALC_AAD_HASH
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; PARTIAL_BLOCK
+;;; Handles encryption/decryption and the tag partial blocks between
+;;; update calls.
+;;; Requires the input data to be at least 1 byte long.
+;;; Output:
+;;; The cipher/plain text of the first partial block (CYPH_PLAIN_OUT),
+;;; the updated AAD_HASH and the updated GDATA_CTX
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 22
+%define %%GDATA_KEY %1 ; [in] key pointer
+%define %%GDATA_CTX %2 ; [in] context pointer
+%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer
+%define %%PLAIN_CYPH_IN %4 ; [in] input buffer
+%define %%PLAIN_CYPH_LEN %5 ; [in] buffer length
+%define %%DATA_OFFSET %6 ; [in/out] data offset (gets updated)
+%define %%AAD_HASH %7 ; [out] updated GHASH value
+%define %%ENC_DEC %8 ; [in] cipher direction
+%define %%GPTMP0 %9 ; [clobbered] GP temporary register
+%define %%GPTMP1 %10 ; [clobbered] GP temporary register
+%define %%GPTMP2 %11 ; [clobbered] GP temporary register
+%define %%ZTMP0 %12 ; [clobbered] ZMM temporary register
+%define %%ZTMP1 %13 ; [clobbered] ZMM temporary register
+%define %%ZTMP2 %14 ; [clobbered] ZMM temporary register
+%define %%ZTMP3 %15 ; [clobbered] ZMM temporary register
+%define %%ZTMP4 %16 ; [clobbered] ZMM temporary register
+%define %%ZTMP5 %17 ; [clobbered] ZMM temporary register
+%define %%ZTMP6 %18 ; [clobbered] ZMM temporary register
+%define %%ZTMP7 %19 ; [clobbered] ZMM temporary register
+%define %%ZTMP8 %20 ; [clobbered] ZMM temporary register
+%define %%ZTMP9 %21 ; [clobbered] ZMM temporary register
+%define %%MASKREG %22 ; [clobbered] mask temporary register
+
+%define %%XTMP0 XWORD(%%ZTMP0)
+%define %%XTMP1 XWORD(%%ZTMP1)
+%define %%XTMP2 XWORD(%%ZTMP2)
+%define %%XTMP3 XWORD(%%ZTMP3)
+%define %%XTMP4 XWORD(%%ZTMP4)
+%define %%XTMP5 XWORD(%%ZTMP5)
+%define %%XTMP6 XWORD(%%ZTMP6)
+%define %%XTMP7 XWORD(%%ZTMP7)
+%define %%XTMP8 XWORD(%%ZTMP8)
+%define %%XTMP9 XWORD(%%ZTMP9)
+
+%define %%LENGTH %%GPTMP0
+%define %%IA0 %%GPTMP1
+%define %%IA1 %%GPTMP2
+
+ mov %%LENGTH, [%%GDATA_CTX + PBlockLen]
+ or %%LENGTH, %%LENGTH
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ READ_SMALL_DATA_INPUT %%XTMP0, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%IA0, %%MASKREG
+
+ ;; XTMP1 = my_ctx_data.partial_block_enc_key
+ vmovdqu64 %%XTMP1, [%%GDATA_CTX + PBlockEncKey]
+ vmovdqu64 %%XTMP2, [%%GDATA_KEY + HashKey]
+
+ ;; adjust the shuffle mask pointer to be able to shift right %%LENGTH bytes
+        ;; ((16 - %%LENGTH) is the number of bytes in plaintext mod 16)
+ lea %%IA0, [rel SHIFT_MASK]
+ add %%IA0, %%LENGTH
+ vmovdqu64 %%XTMP3, [%%IA0] ; shift right shuffle mask
+ vpshufb %%XTMP1, %%XTMP3
+
+%ifidn %%ENC_DEC, DEC
+ ;; keep copy of cipher text in %%XTMP4
+ vmovdqa64 %%XTMP4, %%XTMP0
+%endif
+ vpxorq %%XTMP1, %%XTMP0 ; Cyphertext XOR E(K, Yn)
+
+ ;; Set %%IA1 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ ;; Determine if partial block is not being filled and shift mask accordingly
+ mov %%IA1, %%PLAIN_CYPH_LEN
+ add %%IA1, %%LENGTH
+ sub %%IA1, 16
+ jge %%_no_extra_mask
+ sub %%IA0, %%IA1
+%%_no_extra_mask:
+ ;; get the appropriate mask to mask out bottom %%LENGTH bytes of %%XTMP1
+ ;; - mask out bottom %%LENGTH bytes of %%XTMP1
+ vmovdqu64 %%XTMP0, [%%IA0 + ALL_F - SHIFT_MASK]
+ vpand %%XTMP1, %%XTMP0
+
+%ifidn %%ENC_DEC, DEC
+ vpand %%XTMP4, %%XTMP0
+ vpshufb %%XTMP4, [rel SHUF_MASK]
+ vpshufb %%XTMP4, %%XTMP3
+ vpxorq %%AAD_HASH, %%XTMP4
+%else
+ vpshufb %%XTMP1, [rel SHUF_MASK]
+ vpshufb %%XTMP1, %%XTMP3
+ vpxorq %%AAD_HASH, %%XTMP1
+%endif
+ cmp %%IA1, 0
+ jl %%_partial_incomplete
+
+ ;; GHASH computation for the last <16 Byte block
+ GHASH_MUL %%AAD_HASH, %%XTMP2, %%XTMP5, %%XTMP6, %%XTMP7, %%XTMP8, %%XTMP9
+
+ mov qword [%%GDATA_CTX + PBlockLen], 0
+
+        ;; Set %%LENGTH to be the number of bytes to write out
+ mov %%IA0, %%LENGTH
+ mov %%LENGTH, 16
+ sub %%LENGTH, %%IA0
+ jmp %%_enc_dec_done
+
+%%_partial_incomplete:
+%ifidn __OUTPUT_FORMAT__, win64
+ mov %%IA0, %%PLAIN_CYPH_LEN
+ add [%%GDATA_CTX + PBlockLen], %%IA0
+%else
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%endif
+ mov %%LENGTH, %%PLAIN_CYPH_LEN
+
+%%_enc_dec_done:
+ ;; output encrypted Bytes
+
+ lea %%IA0, [rel byte_len_to_mask_table]
+ kmovw %%MASKREG, [%%IA0 + %%LENGTH*2]
+ vmovdqu64 [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+%ifidn %%ENC_DEC, ENC
+ ;; shuffle XTMP1 back to output as ciphertext
+ vpshufb %%XTMP1, [rel SHUF_MASK]
+ vpshufb %%XTMP1, %%XTMP3
+%endif
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET]{%%MASKREG}, %%XTMP1
+ add %%DATA_OFFSET, %%LENGTH
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
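+
+;;; Cross-call state used by PARTIAL_BLOCK: [GDATA_CTX + PBlockLen] holds the
+;;; number of bytes already gathered in the pending partial block and
+;;; [GDATA_CTX + PBlockEncKey] holds the encrypted counter block for it.
+;;; When the new data completes the 16-byte block, the block is GHASH'ed and
+;;; PBlockLen is cleared; otherwise only the accumulated length is updated.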
+
+
+%macro GHASH_SINGLE_MUL 9
+%define %%GDATA %1
+%define %%HASHKEY %2
+%define %%CIPHER %3
+%define %%STATE_11 %4
+%define %%STATE_00 %5
+%define %%STATE_MID %6
+%define %%T1 %7
+%define %%T2 %8
+%define %%FIRST %9
+
+ vmovdqu %%T1, [%%GDATA + %%HASHKEY]
+%ifidn %%FIRST, first
+ vpclmulqdq %%STATE_11, %%CIPHER, %%T1, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%STATE_00, %%CIPHER, %%T1, 0x00 ; %%T4_2 = a0*b0
+ vpclmulqdq %%STATE_MID, %%CIPHER, %%T1, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 ; %%T5 = a0*b1
+ vpxor %%STATE_MID, %%STATE_MID, %%T2
+%else
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x11
+ vpxor %%STATE_11, %%STATE_11, %%T2
+
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x00
+ vpxor %%STATE_00, %%STATE_00, %%T2
+
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x01
+ vpxor %%STATE_MID, %%STATE_MID, %%T2
+
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10
+ vpxor %%STATE_MID, %%STATE_MID, %%T2
+%endif
+
+%endmacro
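+
+;;; GHASH_SINGLE_MUL accumulates the four 64x64-bit partial products of the
+;;; cipher block argument and the selected hash key power: a1*b1 into
+;;; STATE_11, a0*b0 into STATE_00 and the two cross products into STATE_MID.
+;;; With the "first" selector the accumulators are initialized, otherwise the
+;;; products are XOR'ed into them; folding and reduction are left to the caller.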
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; This macro is used to "warm up" the pipeline for the GHASH_8_ENCRYPT_8_PARALLEL
+;;; macro code. It is called only for data lengths of 128 bytes and above.
+;;; The flow is as follows:
+;;; - encrypt the initial %%num_initial_blocks blocks (can be 0)
+;;; - encrypt the next 8 blocks and stitch with
+;;; GHASH for the first %%num_initial_blocks
+;;; - the last 8th block can be partial (lengths between 129 and 239)
+;;; - partial block ciphering is handled within this macro
+;;; - top bytes of such block are cleared for
+;;; the subsequent GHASH calculations
+;;; - PBlockEncKey needs to be set up in case of multi-call
+;;; - top bytes of the block need to include the encrypted counter block so that,
+;;;   when handling the partial block case, text is read and XOR'ed against it.
+;;;   This needs to be in un-shuffled format.
+
+%macro INITIAL_BLOCKS 26-27
+%define %%GDATA_KEY %1 ; [in] pointer to GCM keys
+%define %%GDATA_CTX %2 ; [in] pointer to GCM context
+%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer
+%define %%PLAIN_CYPH_IN %4 ; [in] input buffer
+%define %%LENGTH %5 ; [in/out] number of bytes to process
+%define %%DATA_OFFSET %6 ; [in/out] data offset
+%define %%num_initial_blocks %7 ; [in] can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%CTR %8 ; [in/out] XMM counter block
+%define %%AAD_HASH %9 ; [in/out] ZMM with AAD hash
+%define %%ZT1 %10 ; [out] ZMM cipher blocks 0-3 for GHASH
+%define %%ZT2 %11 ; [out] ZMM cipher blocks 4-7 for GHASH
+%define %%ZT3 %12 ; [clobbered] ZMM temporary
+%define %%ZT4 %13 ; [clobbered] ZMM temporary
+%define %%ZT5 %14 ; [clobbered] ZMM temporary
+%define %%ZT6 %15 ; [clobbered] ZMM temporary
+%define %%ZT7 %16 ; [clobbered] ZMM temporary
+%define %%ZT8 %17 ; [clobbered] ZMM temporary
+%define %%ZT9 %18 ; [clobbered] ZMM temporary
+%define %%ZT10 %19 ; [clobbered] ZMM temporary
+%define %%ZT11 %20 ; [clobbered] ZMM temporary
+%define %%ZT12 %21 ; [clobbered] ZMM temporary
+%define %%IA0 %22 ; [clobbered] GP temporary
+%define %%IA1 %23 ; [clobbered] GP temporary
+%define %%ENC_DEC %24 ; [in] ENC/DEC selector
+%define %%MASKREG %25 ; [clobbered] mask register
+%define %%SHUFMASK %26 ; [in] ZMM with BE/LE shuffle mask
+%define %%PARTIAL_PRESENT %27 ; [in] "no_partial_block" option can be passed here (if length is guaranteed to be > 15*16 bytes)
+
+%define %%T1 XWORD(%%ZT1)
+%define %%T2 XWORD(%%ZT2)
+%define %%T3 XWORD(%%ZT3)
+%define %%T4 XWORD(%%ZT4)
+%define %%T5 XWORD(%%ZT5)
+%define %%T6 XWORD(%%ZT6)
+%define %%T7 XWORD(%%ZT7)
+%define %%T8 XWORD(%%ZT8)
+%define %%T9 XWORD(%%ZT9)
+
+%define %%TH %%ZT10
+%define %%TM %%ZT11
+%define %%TL %%ZT12
+
+;; determine if partial block code needs to be added
+%assign partial_block_possible 1
+%if %0 > 26
+%ifidn %%PARTIAL_PRESENT, no_partial_block
+%assign partial_block_possible 0
+%endif
+%endif
+
+%if %%num_initial_blocks > 0
+ ;; prepare AES counter blocks
+%if %%num_initial_blocks == 1
+ vpaddd %%T3, %%CTR, [rel ONE]
+%elif %%num_initial_blocks == 2
+ vshufi64x2 YWORD(%%ZT3), YWORD(%%CTR), YWORD(%%CTR), 0
+ vpaddd YWORD(%%ZT3), YWORD(%%ZT3), [rel ddq_add_1234]
+%else
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ vpaddd %%ZT3, ZWORD(%%CTR), [rel ddq_add_1234]
+ vpaddd %%ZT4, ZWORD(%%CTR), [rel ddq_add_5678]
+%endif
+
+ ;; extract new counter value (%%T3)
+ ;; shuffle the counters for AES rounds
+%if %%num_initial_blocks <= 4
+ vextracti32x4 %%CTR, %%ZT3, (%%num_initial_blocks - 1)
+%else
+ vextracti32x4 %%CTR, %%ZT4, (%%num_initial_blocks - 5)
+%endif
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+
+ ;; load plain/cipher text
+ ZMM_LOAD_BLOCKS_0_16 %%num_initial_blocks, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \
+ %%ZT5, %%ZT6, no_zmm, no_zmm
+
+ ;; AES rounds and XOR with plain/cipher text
+%assign j 0
+%rep (NROUNDS + 2)
+ vbroadcastf64x2 %%ZT1, [%%GDATA_KEY + (j * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT1, j, \
+ %%ZT5, %%ZT6, no_zmm, no_zmm, \
+ %%num_initial_blocks, NROUNDS
+%assign j (j + 1)
+%endrep
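+        ;; note: NROUNDS is expected to be 9, 11 or 13 for AES128, AES192 and
+        ;; AES256 respectively, so the (NROUNDS + 2) iterations above consume
+        ;; the whole key schedule: the initial key add, NROUNDS middle rounds
+        ;; and the final round (see the GCM192/GCM256 notes in
+        ;; GHASH_16_ENCRYPT_16_PARALLEL further below).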
+
+ ;; write cipher/plain text back to output and
+ ;; zero bytes outside the mask before hashing
+ ZMM_STORE_BLOCKS_0_16 %%num_initial_blocks, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm
+
+ ;; Shuffle the cipher text blocks for hashing part
+ ;; ZT5 and ZT6 are expected outputs with blocks for hashing
+%ifidn %%ENC_DEC, DEC
+ ;; Decrypt case
+ ;; - cipher blocks are in ZT5 & ZT6
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%ZT5, %%ZT6, no_zmm, no_zmm, \
+ %%ZT5, %%ZT6, no_zmm, no_zmm, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+%else
+ ;; Encrypt case
+ ;; - cipher blocks are in ZT3 & ZT4
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%ZT5, %%ZT6, no_zmm, no_zmm, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+%endif ; Encrypt
+
+ ;; adjust data offset and length
+ sub %%LENGTH, (%%num_initial_blocks * 16)
+ add %%DATA_OFFSET, (%%num_initial_blocks * 16)
+
+ ;; At this stage
+ ;; - ZT5:ZT6 include cipher blocks to be GHASH'ed
+
+%endif ; %%num_initial_blocks > 0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; - cipher of %%num_initial_blocks is done
+ ;; - prepare counter blocks for the next 8 blocks (ZT3 & ZT4)
+ ;; - save the last block in %%CTR
+ ;; - shuffle the blocks for AES
+ ;; - stitch encryption of the new blocks with
+ ;; GHASHING the previous blocks
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ vpaddd %%ZT3, ZWORD(%%CTR), [rel ddq_add_1234]
+ vpaddd %%ZT4, ZWORD(%%CTR), [rel ddq_add_5678]
+ vextracti32x4 %%CTR, %%ZT4, 3
+
+ vpshufb %%ZT3, %%SHUFMASK
+ vpshufb %%ZT4, %%SHUFMASK
+
+%if partial_block_possible != 0
+ ;; get text load/store mask (assume full mask by default)
+ mov %%IA0, 0xffff_ffff_ffff_ffff
+%if %%num_initial_blocks > 0
+ ;; NOTE: 'jge' is always taken for %%num_initial_blocks = 0
+        ;; This macro is executed for lengths of 128 and up;
+ ;; zero length is checked in GCM_ENC_DEC.
+ ;; We know there is partial block if:
+ ;; LENGTH - 16*num_initial_blocks < 128
+ cmp %%LENGTH, 128
+ jge %%_initial_partial_block_continue
+ mov %%IA1, rcx
+ mov rcx, 128
+ sub rcx, %%LENGTH
+ shr %%IA0, cl
+ mov rcx, %%IA1
+%%_initial_partial_block_continue:
+%endif
+ kmovq %%MASKREG, %%IA0
+ ;; load plain or cipher text (masked)
+ ZMM_LOAD_MASKED_BLOCKS_0_16 8, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm, %%MASKREG
+%else
+ ;; load plain or cipher text
+ ZMM_LOAD_BLOCKS_0_16 8, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm
+%endif ;; partial_block_possible
+
+ ;; === AES ROUND 0
+%assign aes_round 0
+ vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT8, aes_round, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+
+ ;; === GHASH blocks 4-7
+%if (%%num_initial_blocks > 0)
+ ;; Hash in AES state
+ vpxorq %%ZT5, %%ZT5, %%AAD_HASH
+
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT6, %%ZT8, %%ZT9, \
+ %%TH, %%TM, %%TL, %%num_initial_blocks
+%endif
+
+ ;; === [1/3] of AES rounds
+
+%rep ((NROUNDS + 1) / 3)
+ vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT8, aes_round, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endrep                 ; %rep ((NROUNDS + 1) / 3)
+
+ ;; === GHASH blocks 0-3 and gather
+%if (%%num_initial_blocks > 0)
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT6, %%ZT5, \
+ %%ZT7, %%ZT8, %%ZT9, \
+ %%TH, %%TM, %%TL, %%num_initial_blocks
+%endif
+
+ ;; === [2/3] of AES rounds
+
+%rep ((NROUNDS + 1) / 3)
+ vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT8, aes_round, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endrep                 ; %rep ((NROUNDS + 1) / 3)
+
+ ;; === GHASH reduction
+
+%if (%%num_initial_blocks > 0)
+ ;; [out] AAD_HASH - hash output
+ ;; [in] T8 - polynomial
+ ;; [in] T6 - high, T5 - low
+ ;; [clobbered] T9, T7 - temporary
+ vmovdqu64 %%T8, [rel POLY2]
+ VCLMUL_REDUCE XWORD(%%AAD_HASH), %%T8, %%T6, %%T5, %%T7, %%T9
+%endif
+
+ ;; === [3/3] of AES rounds
+
+%rep (((NROUNDS + 1) / 3) + 2)
+%if aes_round < (NROUNDS + 2)
+ vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT8, aes_round, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endif
+%endrep                 ; %rep (((NROUNDS + 1) / 3) + 2)
+
+%if partial_block_possible != 0
+ ;; write cipher/plain text back to output and
+ ;; zero bytes outside the mask before hashing
+ ZMM_STORE_MASKED_BLOCKS_0_16 8, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm, %%MASKREG
+ ;; check if there is partial block
+ cmp %%LENGTH, 128
+ jl %%_initial_save_partial
+ ;; adjust offset and length
+ add %%DATA_OFFSET, 128
+ sub %%LENGTH, 128
+ jmp %%_initial_blocks_done
+%%_initial_save_partial:
+ ;; partial block case
+ ;; - save the partial block in unshuffled format
+ ;; - ZT4 is partially XOR'ed with data and top bytes contain
+ ;; encrypted counter block only
+        ;; - save the number of bytes processed in the partial block
+ ;; - adjust offset and zero the length
+ ;; - clear top bytes of the partial block for subsequent GHASH calculations
+ vextracti32x4 [%%GDATA_CTX + PBlockEncKey], %%ZT4, 3
+ add %%DATA_OFFSET, %%LENGTH
+ sub %%LENGTH, (128 - 16)
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH
+ xor %%LENGTH, %%LENGTH
+ vmovdqu8 %%ZT4{%%MASKREG}{z}, %%ZT4
+%%_initial_blocks_done:
+%else
+ ZMM_STORE_BLOCKS_0_16 8, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm
+ add %%DATA_OFFSET, 128
+ sub %%LENGTH, 128
+%endif ;; partial_block_possible
+
+ ;; Shuffle AES result for GHASH.
+%ifidn %%ENC_DEC, DEC
+ ;; Decrypt case
+ ;; - cipher blocks are in ZT1 & ZT2
+ vpshufb %%ZT1, %%SHUFMASK
+ vpshufb %%ZT2, %%SHUFMASK
+%else
+ ;; Encrypt case
+ ;; - cipher blocks are in ZT3 & ZT4
+ vpshufb %%ZT1, %%ZT3, %%SHUFMASK
+ vpshufb %%ZT2, %%ZT4, %%SHUFMASK
+%endif ; Encrypt
+
+ ;; Current hash value is in AAD_HASH
+
+ ;; Combine GHASHed value with the corresponding ciphertext
+ vpxorq %%ZT1, %%ZT1, %%AAD_HASH
+
+%endmacro ; INITIAL_BLOCKS
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; INITIAL_BLOCKS_PARTIAL macro with support for a partial final block.
+;;; It may look similar to INITIAL_BLOCKS but its usage is different:
+;;; - first encrypts/decrypts the required number of blocks and then
+;;;   ghashes these blocks
+;;; - Small packets or left-over data chunks (< 256 bytes)
+;;; - single or multi call
+;;; - Remaining data chunks below 256 bytes (multi buffer code)
+;;;
+;;; num_initial_blocks is expected to include the partial final block
+;;; in the count.
+%macro INITIAL_BLOCKS_PARTIAL 41
+%define %%GDATA_KEY %1 ; [in] key pointer
+%define %%GDATA_CTX %2 ; [in] context pointer
+%define %%CYPH_PLAIN_OUT %3 ; [in] text out pointer
+%define %%PLAIN_CYPH_IN         %4  ; [in] text in pointer
+%define %%LENGTH %5 ; [in/clobbered] length in bytes
+%define %%DATA_OFFSET %6 ; [in/out] current data offset (updated)
+%define %%num_initial_blocks %7 ; [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
+%define %%CTR %8 ; [in/out] current counter value
+%define %%HASH_IN_OUT %9 ; [in/out] XMM ghash in/out value
+%define %%ENC_DEC %10 ; [in] cipher direction (ENC/DEC)
+%define %%INSTANCE_TYPE %11 ; [in] multi_call or single_call
+%define %%ZT0 %12 ; [clobbered] ZMM temporary
+%define %%ZT1 %13 ; [clobbered] ZMM temporary
+%define %%ZT2 %14 ; [clobbered] ZMM temporary
+%define %%ZT3 %15 ; [clobbered] ZMM temporary
+%define %%ZT4 %16 ; [clobbered] ZMM temporary
+%define %%ZT5 %17 ; [clobbered] ZMM temporary
+%define %%ZT6 %18 ; [clobbered] ZMM temporary
+%define %%ZT7 %19 ; [clobbered] ZMM temporary
+%define %%ZT8 %20 ; [clobbered] ZMM temporary
+%define %%ZT9 %21 ; [clobbered] ZMM temporary
+%define %%ZT10 %22 ; [clobbered] ZMM temporary
+%define %%ZT11 %23 ; [clobbered] ZMM temporary
+%define %%ZT12 %24 ; [clobbered] ZMM temporary
+%define %%ZT13 %25 ; [clobbered] ZMM temporary
+%define %%ZT14 %26 ; [clobbered] ZMM temporary
+%define %%ZT15 %27 ; [clobbered] ZMM temporary
+%define %%ZT16 %28 ; [clobbered] ZMM temporary
+%define %%ZT17 %29 ; [clobbered] ZMM temporary
+%define %%ZT18 %30 ; [clobbered] ZMM temporary
+%define %%ZT19 %31 ; [clobbered] ZMM temporary
+%define %%ZT20 %32 ; [clobbered] ZMM temporary
+%define %%ZT21 %33 ; [clobbered] ZMM temporary
+%define %%ZT22 %34 ; [clobbered] ZMM temporary
+%define %%GH %35 ; [in] ZMM ghash sum (high)
+%define %%GL %36 ; [in] ZMM ghash sum (low)
+%define %%GM %37 ; [in] ZMM ghash sum (middle)
+%define %%IA0 %38 ; [clobbered] GP temporary
+%define %%IA1 %39 ; [clobbered] GP temporary
+%define %%MASKREG %40 ; [clobbered] mask register
+%define %%SHUFMASK %41 ; [in] ZMM with BE/LE shuffle mask
+
+%define %%T1 XWORD(%%ZT1)
+%define %%T2 XWORD(%%ZT2)
+%define %%T7 XWORD(%%ZT7)
+
+%define %%CTR0 %%ZT3
+%define %%CTR1 %%ZT4
+%define %%CTR2 %%ZT8
+%define %%CTR3 %%ZT9
+
+%define %%DAT0 %%ZT5
+%define %%DAT1 %%ZT6
+%define %%DAT2 %%ZT10
+%define %%DAT3 %%ZT11
+
+%ifnidn %%GH, no_zmm
+%ifnidn %%GL, no_zmm
+%ifnidn %%GM, no_zmm
+        ;; when temporary sums are passed then zero the HASH IN value
+        ;; - whatever it holds is invalid in this case
+ vpxorq %%HASH_IN_OUT, %%HASH_IN_OUT
+%endif
+%endif
+%endif
+ ;; Copy ghash to temp reg
+ vmovdqa64 %%T2, %%HASH_IN_OUT
+
+ ;; prepare AES counter blocks
+%if %%num_initial_blocks == 1
+ vpaddd XWORD(%%CTR0), %%CTR, [rel ONE]
+%elif %%num_initial_blocks == 2
+ vshufi64x2 YWORD(%%CTR0), YWORD(%%CTR), YWORD(%%CTR), 0
+ vpaddd YWORD(%%CTR0), YWORD(%%CTR0), [rel ddq_add_1234]
+%else
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ vpaddd %%CTR0, ZWORD(%%CTR), [rel ddq_add_1234]
+%if %%num_initial_blocks > 4
+ vpaddd %%CTR1, ZWORD(%%CTR), [rel ddq_add_5678]
+%endif
+%if %%num_initial_blocks > 8
+ vpaddd %%CTR2, %%CTR0, [rel ddq_add_8888]
+%endif
+%if %%num_initial_blocks > 12
+ vpaddd %%CTR3, %%CTR1, [rel ddq_add_8888]
+%endif
+%endif
+
+ ;; get load/store mask
+ lea %%IA0, [rel byte64_len_to_mask_table]
+ mov %%IA1, %%LENGTH
+%if %%num_initial_blocks > 12
+ sub %%IA1, 3 * 64
+%elif %%num_initial_blocks > 8
+ sub %%IA1, 2 * 64
+%elif %%num_initial_blocks > 4
+ sub %%IA1, 64
+%endif
+ kmovq %%MASKREG, [%%IA0 + %%IA1*8]
+
+ ;; extract new counter value
+ ;; shuffle the counters for AES rounds
+%if %%num_initial_blocks <= 4
+ vextracti32x4 %%CTR, %%CTR0, (%%num_initial_blocks - 1)
+%elif %%num_initial_blocks <= 8
+ vextracti32x4 %%CTR, %%CTR1, (%%num_initial_blocks - 5)
+%elif %%num_initial_blocks <= 12
+ vextracti32x4 %%CTR, %%CTR2, (%%num_initial_blocks - 9)
+%else
+ vextracti32x4 %%CTR, %%CTR3, (%%num_initial_blocks - 13)
+%endif
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%CTR0, %%CTR1, %%CTR2, %%CTR3, \
+ %%CTR0, %%CTR1, %%CTR2, %%CTR3, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+
+ ;; load plain/cipher text
+ ZMM_LOAD_MASKED_BLOCKS_0_16 %%num_initial_blocks, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \
+ %%DAT0, %%DAT1, %%DAT2, %%DAT3, %%MASKREG
+
+ ;; AES rounds and XOR with plain/cipher text
+%assign j 0
+%rep (NROUNDS + 2)
+ vbroadcastf64x2 %%ZT1, [%%GDATA_KEY + (j * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%CTR0, %%CTR1, %%CTR2, %%CTR3, \
+ %%ZT1, j, \
+ %%DAT0, %%DAT1, %%DAT2, %%DAT3, \
+ %%num_initial_blocks, NROUNDS
+%assign j (j + 1)
+%endrep
+
+ ;; retrieve the last cipher counter block (partially XOR'ed with text)
+ ;; - this is needed for partial block cases
+%if %%num_initial_blocks <= 4
+ vextracti32x4 %%T1, %%CTR0, (%%num_initial_blocks - 1)
+%elif %%num_initial_blocks <= 8
+ vextracti32x4 %%T1, %%CTR1, (%%num_initial_blocks - 5)
+%elif %%num_initial_blocks <= 12
+ vextracti32x4 %%T1, %%CTR2, (%%num_initial_blocks - 9)
+%else
+ vextracti32x4 %%T1, %%CTR3, (%%num_initial_blocks - 13)
+%endif
+
+ ;; write cipher/plain text back to output and
+ ZMM_STORE_MASKED_BLOCKS_0_16 %%num_initial_blocks, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \
+ %%CTR0, %%CTR1, %%CTR2, %%CTR3, %%MASKREG
+
+ ;; zero bytes outside the mask before hashing
+%if %%num_initial_blocks <= 4
+ vmovdqu8 %%CTR0{%%MASKREG}{z}, %%CTR0
+%elif %%num_initial_blocks <= 8
+ vmovdqu8 %%CTR1{%%MASKREG}{z}, %%CTR1
+%elif %%num_initial_blocks <= 12
+ vmovdqu8 %%CTR2{%%MASKREG}{z}, %%CTR2
+%else
+ vmovdqu8 %%CTR3{%%MASKREG}{z}, %%CTR3
+%endif
+
+        ;; Shuffle the cipher text blocks for the hashing part
+        ;; DAT0-DAT3 are the expected outputs with blocks for hashing
+%ifidn %%ENC_DEC, DEC
+        ;; Decrypt case
+        ;; - cipher blocks are in DAT0-DAT3
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%DAT0, %%DAT1, %%DAT2, %%DAT3, \
+ %%DAT0, %%DAT1, %%DAT2, %%DAT3, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+%else
+ ;; Encrypt case
+ ;; - cipher blocks are in CTR0-CTR3
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%DAT0, %%DAT1, %%DAT2, %%DAT3, \
+ %%CTR0, %%CTR1, %%CTR2, %%CTR3, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+%endif ; Encrypt
+
+ ;; Extract the last block for partials and multi_call cases
+%if %%num_initial_blocks <= 4
+ vextracti32x4 %%T7, %%DAT0, %%num_initial_blocks - 1
+%elif %%num_initial_blocks <= 8
+ vextracti32x4 %%T7, %%DAT1, %%num_initial_blocks - 5
+%elif %%num_initial_blocks <= 12
+ vextracti32x4 %%T7, %%DAT2, %%num_initial_blocks - 9
+%else
+ vextracti32x4 %%T7, %%DAT3, %%num_initial_blocks - 13
+%endif
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Hash all but the last block of data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;; update data offset
+%if %%num_initial_blocks > 1
+ ;; The final block of data may be <16B
+ add %%DATA_OFFSET, 16 * (%%num_initial_blocks - 1)
+ sub %%LENGTH, 16 * (%%num_initial_blocks - 1)
+%endif
+
+%if %%num_initial_blocks < 16
+ ;; NOTE: the 'jl' is always taken for num_initial_blocks = 16.
+ ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 256.
+ cmp %%LENGTH, 16
+ jl %%_small_initial_partial_block
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Handle a full length final block - encrypt and hash all blocks
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ sub %%LENGTH, 16
+ add %%DATA_OFFSET, 16
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH
+
+ ;; Hash all of the data
+
+ ;; ZT2 - incoming AAD hash (low 128bits)
+ ;; ZT12-ZT20 - temporary registers
+ GHASH_1_TO_16 %%GDATA_KEY, %%HASH_IN_OUT, \
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15, %%ZT16, \
+ %%ZT17, %%ZT18, %%ZT19, %%ZT20, \
+ %%GH, %%GL, %%GM, \
+ %%ZT2, %%DAT0, %%DAT1, %%DAT2, %%DAT3, \
+ %%num_initial_blocks
+
+ jmp %%_small_initial_compute_done
+%endif ; %if %%num_initial_blocks < 16
+
+%%_small_initial_partial_block:
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;; Handle ghash for a <16B final block
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+        ;; In this case, if it's a single call to encrypt we can
+        ;; hash all of the data, but if it's an init / update / finalize
+        ;; series of calls we need to leave out the last block if it's
+        ;; less than a full block of data.
+
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH
+ ;; %%T1 is ciphered counter block
+ vmovdqu64 [%%GDATA_CTX + PBlockEncKey], %%T1
+
+%ifidn %%INSTANCE_TYPE, multi_call
+%assign k (%%num_initial_blocks - 1)
+%assign last_block_to_hash 1
+%else
+%assign k (%%num_initial_blocks)
+%assign last_block_to_hash 0
+%endif
+
+%if (%%num_initial_blocks > last_block_to_hash)
+
+ ;; ZT12-ZT20 - temporary registers
+ GHASH_1_TO_16 %%GDATA_KEY, %%HASH_IN_OUT, \
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15, %%ZT16, \
+ %%ZT17, %%ZT18, %%ZT19, %%ZT20, \
+ %%GH, %%GL, %%GM, \
+ %%ZT2, %%DAT0, %%DAT1, %%DAT2, %%DAT3, k
+
+ ;; just fall through no jmp needed
+%else
+ ;; Record that a reduction is not needed -
+ ;; In this case no hashes are computed because there
+ ;; is only one initial block and it is < 16B in length.
+ ;; We only need to check if a reduction is needed if
+ ;; initial_blocks == 1 and init/update/final is being used.
+ ;; In this case we may just have a partial block, and that
+ ;; gets hashed in finalize.
+
+%assign need_for_reduction 1
+%ifidn %%GH, no_zmm
+%ifidn %%GL, no_zmm
+%ifidn %%GM, no_zmm
+;; if %%GH, %%GL & %%GM not passed then reduction is not required
+%assign need_for_reduction 0
+%endif
+%endif
+%endif
+
+%if need_for_reduction == 0
+ ;; The hash should end up in HASH_IN_OUT.
+ ;; The only way we should get here is if there is
+ ;; a partial block of data, so xor that into the hash.
+ vpxorq %%HASH_IN_OUT, %%T2, %%T7
+%else
+        ;; here we have nothing to ghash in the small data, but
+        ;; GHASH sums were passed in that we need to gather and reduce
+
+ ;; integrate TM into TH and TL
+ vpsrldq %%ZT12, %%GM, 8
+ vpslldq %%ZT13, %%GM, 8
+ vpxorq %%GH, %%GH, %%ZT12
+ vpxorq %%GL, %%GL, %%ZT13
+
+ ;; add TH and TL 128-bit words horizontally
+ VHPXORI4x128 %%GH, %%ZT12
+ VHPXORI4x128 %%GL, %%ZT13
+
+ ;; reduction
+ vmovdqa64 XWORD(%%ZT12), [rel POLY2]
+ VCLMUL_REDUCE %%HASH_IN_OUT, XWORD(%%ZT12), \
+ XWORD(%%GH), XWORD(%%GL), XWORD(%%ZT13), XWORD(%%ZT14)
+
+ vpxorq %%HASH_IN_OUT, %%HASH_IN_OUT, %%T7
+%endif
+ ;; The result is in %%HASH_IN_OUT
+ jmp %%_after_reduction
+%endif
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; After GHASH reduction
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_small_initial_compute_done:
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; If using init/update/finalize, we need to xor any partial block data
+ ;; into the hash.
+%if %%num_initial_blocks > 1
+ ;; NOTE: for %%num_initial_blocks = 0 the xor never takes place
+%if %%num_initial_blocks != 16
+        ;; NOTE: for %%num_initial_blocks = 16, %%LENGTH (stored in [PBlockLen]) is never zero
+ or %%LENGTH, %%LENGTH
+ je %%_after_reduction
+%endif ; %%num_initial_blocks != 16
+ vpxorq %%HASH_IN_OUT, %%HASH_IN_OUT, %%T7
+%endif ; %%num_initial_blocks > 1
+%endif ; %%INSTANCE_TYPE, multi_call
+
+%%_after_reduction:
+ ;; Final hash is now in HASH_IN_OUT
+
+%endmacro ; INITIAL_BLOCKS_PARTIAL
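+
+;;; Note on the %%GH/%%GL/%%GM arguments of INITIAL_BLOCKS_PARTIAL: when all
+;;; three are "no_zmm" the incoming hash is taken from %%HASH_IN_OUT; when
+;;; real ZMM registers are passed they carry unreduced high/low/middle GHASH
+;;; sums to be folded in, and %%HASH_IN_OUT is zeroed on entry and only used
+;;; for the reduced output.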
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Main GCM macro stitching cipher with GHASH
+;;; - operates on single stream
+;;; - encrypts 8 blocks at a time
+;;; - ghash the 8 previously encrypted ciphertext blocks
+;;; For the partial block case with multi_call, AES_PARTIAL_BLOCK on output
+;;; contains the encrypted counter block.
+%macro GHASH_8_ENCRYPT_8_PARALLEL 34-37
+%define %%GDATA %1 ; [in] key pointer
+%define %%CYPH_PLAIN_OUT %2 ; [in] pointer to output buffer
+%define %%PLAIN_CYPH_IN %3 ; [in] pointer to input buffer
+%define %%DATA_OFFSET %4 ; [in] data offset
+%define %%CTR1 %5 ; [in/out] ZMM counter blocks 0 to 3
+%define %%CTR2 %6 ; [in/out] ZMM counter blocks 4 to 7
+%define %%GHASHIN_AESOUT_B03 %7 ; [in/out] ZMM ghash in / aes out blocks 0 to 3
+%define %%GHASHIN_AESOUT_B47 %8 ; [in/out] ZMM ghash in / aes out blocks 4 to 7
+%define %%AES_PARTIAL_BLOCK %9 ; [out] XMM partial block (AES)
+%define %%loop_idx %10 ; [in] counter block prep selection "add+shuffle" or "add"
+%define %%ENC_DEC %11 ; [in] cipher direction
+%define %%FULL_PARTIAL %12 ; [in] last block type selection "full" or "partial"
+%define %%IA0 %13 ; [clobbered] temporary GP register
+%define %%IA1 %14 ; [clobbered] temporary GP register
+%define %%LENGTH %15 ; [in] length
+%define %%INSTANCE_TYPE %16 ; [in] 'single_call' or 'multi_call' selection
+%define %%GH4KEY %17 ; [in] ZMM with GHASH keys 4 to 1
+%define %%GH8KEY %18 ; [in] ZMM with GHASH keys 8 to 5
+%define %%SHFMSK %19 ; [in] ZMM with byte swap mask for pshufb
+%define %%ZT1 %20 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT2 %21 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT3 %22 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT4 %23 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT5 %24 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT10 %25 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT11 %26 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT12 %27 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT13 %28 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT14 %29 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT15 %30 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT16 %31 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT17 %32 ; [clobbered] temporary ZMM (ghash)
+%define %%MASKREG %33 ; [clobbered] mask register for partial loads/stores
+%define %%DO_REDUCTION %34 ; [in] "reduction", "no_reduction", "final_reduction"
+%define %%TO_REDUCE_L %35 ; [in/out] ZMM for low 4x128-bit in case of "no_reduction"
+%define %%TO_REDUCE_H %36 ; [in/out] ZMM for hi 4x128-bit in case of "no_reduction"
+%define %%TO_REDUCE_M %37 ; [in/out] ZMM for medium 4x128-bit in case of "no_reduction"
+
+%define %%GH1H %%ZT10
+%define %%GH1L %%ZT11
+%define %%GH1M1 %%ZT12
+%define %%GH1M2 %%ZT13
+
+%define %%GH2H %%ZT14
+%define %%GH2L %%ZT15
+%define %%GH2M1 %%ZT16
+%define %%GH2M2 %%ZT17
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; populate counter blocks for cipher part
+%ifidn %%loop_idx, in_order
+        ;; %%CTR1 & %%CTR2 are shuffled outside the scope of this macro;
+        ;; they have to be kept in unshuffled format
+ vpshufb %%ZT1, %%CTR1, %%SHFMSK
+ vpshufb %%ZT2, %%CTR2, %%SHFMSK
+%else
+ vmovdqa64 %%ZT1, %%CTR1
+ vmovdqa64 %%ZT2, %%CTR2
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; stitch AES rounds with GHASH
+
+%assign aes_round 0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 0 - ARK
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+
+ ;;==================================================
+ ;; GHASH 4 blocks
+ vpclmulqdq %%GH1H, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x11 ; a1*b1
+ vpclmulqdq %%GH1L, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x00 ; a0*b0
+ vpclmulqdq %%GH1M1, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x01 ; a1*b0
+ vpclmulqdq %%GH1M2, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x10 ; a0*b1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 3 AES rounds
+%rep 3
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endrep ; 3 x AES ROUND
+
+ ;; =================================================
+ ;; GHASH 4 blocks
+ vpclmulqdq %%GH2M1, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x10 ; a0*b1
+ vpclmulqdq %%GH2M2, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x01 ; a1*b0
+ vpclmulqdq %%GH2H, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x11 ; a1*b1
+ vpclmulqdq %%GH2L, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x00 ; a0*b0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 3 AES rounds
+%rep 3
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endrep ; 3 x AES ROUND
+
+ ;; =================================================
+ ;; gather GHASH in GH1L (low) and GH1H (high)
+%ifidn %%DO_REDUCTION, no_reduction
+ vpternlogq %%GH1M1, %%GH1M2, %%GH2M1, 0x96 ; TM: GH1M1 ^= GH1M2 ^ GH2M1
+ vpternlogq %%TO_REDUCE_M, %%GH1M1, %%GH2M2, 0x96 ; TM: TO_REDUCE_M ^= GH1M1 ^ GH2M2
+ vpternlogq %%TO_REDUCE_H, %%GH1H, %%GH2H, 0x96 ; TH: TO_REDUCE_H ^= GH1H ^ GH2H
+ vpternlogq %%TO_REDUCE_L, %%GH1L, %%GH2L, 0x96 ; TL: TO_REDUCE_L ^= GH1L ^ GH2L
+%endif
+%ifidn %%DO_REDUCTION, do_reduction
+ ;; phase 1: add mid products together
+ vpternlogq %%GH1M1, %%GH1M2, %%GH2M1, 0x96 ; TM: GH1M1 ^= GH1M2 ^ GH2M1
+ vpxorq %%GH1M1, %%GH1M1, %%GH2M2
+
+ vpsrldq %%GH2M1, %%GH1M1, 8
+ vpslldq %%GH1M1, %%GH1M1, 8
+%endif
+%ifidn %%DO_REDUCTION, final_reduction
+ ;; phase 1: add mid products together
+ vpternlogq %%GH1M1, %%GH1M2, %%GH2M1, 0x96 ; TM: GH1M1 ^= GH1M2 ^ GH2M1
+ vpternlogq %%GH1M1, %%TO_REDUCE_M, %%GH2M2, 0x96 ; TM: GH1M1 ^= TO_REDUCE_M ^ GH2M2
+
+ vpsrldq %%GH2M1, %%GH1M1, 8
+ vpslldq %%GH1M1, %%GH1M1, 8
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 2 AES rounds
+%rep 2
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endrep ; 2 x AES ROUND
+
+ ;; =================================================
+ ;; Add mid product to high and low then
+ ;; horizontal xor of low and high 4x128
+%ifidn %%DO_REDUCTION, final_reduction
+ vpternlogq %%GH1H, %%GH2H, %%GH2M1, 0x96 ; TH = TH1 + TH2 + TM>>64
+ vpxorq %%GH1H, %%TO_REDUCE_H
+ vpternlogq %%GH1L, %%GH2L, %%GH1M1, 0x96 ; TL = TL1 + TL2 + TM<<64
+ vpxorq %%GH1L, %%TO_REDUCE_L
+%endif
+%ifidn %%DO_REDUCTION, do_reduction
+ vpternlogq %%GH1H, %%GH2H, %%GH2M1, 0x96 ; TH = TH1 + TH2 + TM>>64
+ vpternlogq %%GH1L, %%GH2L, %%GH1M1, 0x96 ; TL = TL1 + TL2 + TM<<64
+%endif
+%ifnidn %%DO_REDUCTION, no_reduction
+ VHPXORI4x128 %%GH1H, %%GH2H
+ VHPXORI4x128 %%GH1L, %%GH2L
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 2 AES rounds
+%rep 2
+%if (aes_round < (NROUNDS + 1))
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endif ; aes_round < (NROUNDS + 1)
+%endrep
+
+ ;; =================================================
+ ;; first phase of reduction
+%ifnidn %%DO_REDUCTION, no_reduction
+ vmovdqu64 XWORD(%%GH2M2), [rel POLY2]
+ vpclmulqdq XWORD(%%ZT15), XWORD(%%GH2M2), XWORD(%%GH1L), 0x01
+ vpslldq XWORD(%%ZT15), XWORD(%%ZT15), 8 ; shift-L 2 DWs
+ vpxorq XWORD(%%ZT15), XWORD(%%GH1L), XWORD(%%ZT15) ; first phase of the reduct
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 2 AES rounds
+%rep 2
+%if (aes_round < (NROUNDS + 1))
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endif ; aes_round < (NROUNDS + 1)
+%endrep
+
+ ;; =================================================
+ ;; second phase of the reduction
+%ifnidn %%DO_REDUCTION, no_reduction
+ vpclmulqdq XWORD(%%ZT16), XWORD(%%GH2M2), XWORD(%%ZT15), 0x00
+ vpsrldq XWORD(%%ZT16), XWORD(%%ZT16), 4 ; shift-R 1-DW to obtain 2-DWs shift-R
+
+ vpclmulqdq XWORD(%%ZT13), XWORD(%%GH2M2), XWORD(%%ZT15), 0x10
+ vpslldq XWORD(%%ZT13), XWORD(%%ZT13), 4 ; shift-L 1-DW for result without shifts
+ ;; ZT13 = ZT13 xor ZT16 xor GH1H
+ vpternlogq XWORD(%%ZT13), XWORD(%%ZT16), XWORD(%%GH1H), 0x96
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; all remaining AES rounds but the last
+%rep (NROUNDS + 2)
+%if (aes_round < (NROUNDS + 1))
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endif ; aes_round < (NROUNDS + 1)
+%endrep
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; load/store mask (partial case) and load the text data
+%ifidn %%FULL_PARTIAL, full
+ vmovdqu8 %%ZT4, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vmovdqu8 %%ZT5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 64]
+%else
+ lea %%IA0, [rel byte64_len_to_mask_table]
+ mov %%IA1, %%LENGTH
+ sub %%IA1, 64
+ kmovq %%MASKREG, [%%IA0 + 8*%%IA1]
+ vmovdqu8 %%ZT4, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vmovdqu8 %%ZT5{%%MASKREG}{z}, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 64]
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; the last AES round (NROUNDS + 1) and XOR against plain/cipher text
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; store the cipher/plain text data
+%ifidn %%FULL_PARTIAL, full
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], %%ZT1
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 64], %%ZT2
+%else
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], %%ZT1
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 64]{%%MASKREG}, %%ZT2
+%endif
+
+ ;; =================================================
+ ;; prep cipher text blocks for the next ghash round
+
+%ifnidn %%FULL_PARTIAL, full
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; for partial block & multi_call we need encrypted counter block
+ vpxorq %%ZT3, %%ZT2, %%ZT5
+ vextracti32x4 %%AES_PARTIAL_BLOCK, %%ZT3, 3
+%endif
+        ;; for GHASH computation purposes, clear the top bytes of the partial block
+%ifidn %%ENC_DEC, ENC
+ vmovdqu8 %%ZT2{%%MASKREG}{z}, %%ZT2
+%else
+ vmovdqu8 %%ZT5{%%MASKREG}{z}, %%ZT5
+%endif
+%endif ; %ifnidn %%FULL_PARTIAL, full
+
+ ;; =================================================
+ ;; shuffle cipher text blocks for GHASH computation
+%ifidn %%ENC_DEC, ENC
+ vpshufb %%GHASHIN_AESOUT_B03, %%ZT1, %%SHFMSK
+ vpshufb %%GHASHIN_AESOUT_B47, %%ZT2, %%SHFMSK
+%else
+ vpshufb %%GHASHIN_AESOUT_B03, %%ZT4, %%SHFMSK
+ vpshufb %%GHASHIN_AESOUT_B47, %%ZT5, %%SHFMSK
+%endif
+
+%ifidn %%DO_REDUCTION, do_reduction
+ ;; =================================================
+ ;; XOR current GHASH value (ZT13) into block 0
+ vpxorq %%GHASHIN_AESOUT_B03, %%ZT13
+%endif
+%ifidn %%DO_REDUCTION, final_reduction
+ ;; =================================================
+ ;; Return GHASH value (ZT13) in TO_REDUCE_L
+ vmovdqa64 %%TO_REDUCE_L, %%ZT13
+%endif
+
+%endmacro ; GHASH_8_ENCRYPT_8_PARALLEL
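+
+;;; %%DO_REDUCTION selects how GHASH_8_ENCRYPT_8_PARALLEL handles the GHASH
+;;; sums: "no_reduction" only accumulates the unreduced high/low/middle sums
+;;; in %%TO_REDUCE_H/L/M, "do_reduction" reduces this iteration's 8 blocks and
+;;; XORs the result into block 0 of %%GHASHIN_AESOUT_B03 for the next pass,
+;;; and "final_reduction" folds the accumulated sums in as well and returns
+;;; the reduced GHASH value in %%TO_REDUCE_L.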
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Main GCM macro stitching cipher with GHASH
+;;; - operates on single stream
+;;; - encrypts 16 blocks at a time
+;;; - ghash the 16 previously encrypted ciphertext blocks
+;;; - no partial block or multi_call handling here
+%macro GHASH_16_ENCRYPT_16_PARALLEL 42
+%define %%GDATA %1 ; [in] key pointer
+%define %%CYPH_PLAIN_OUT %2 ; [in] pointer to output buffer
+%define %%PLAIN_CYPH_IN %3 ; [in] pointer to input buffer
+%define %%DATA_OFFSET %4 ; [in] data offset
+%define %%CTR_BE %5 ; [in/out] ZMM counter blocks (last 4) in big-endian
+%define %%CTR_CHECK %6 ; [in/out] GP with 8-bit counter for overflow check
+%define %%HASHKEY_OFFSET %7 ; [in] numerical offset for the highest hash key
+%define %%AESOUT_BLK_OFFSET %8 ; [in] numerical offset for AES-CTR out
+%define %%GHASHIN_BLK_OFFSET %9 ; [in] numerical offset for GHASH blocks in
+%define %%SHFMSK %10 ; [in] ZMM with byte swap mask for pshufb
+%define %%ZT1 %11 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT2 %12 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT3 %13 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT4 %14 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT5 %15 ; [clobbered/out] temporary ZMM or GHASH OUT (final_reduction)
+%define %%ZT6 %16 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT7 %17 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT8 %18 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT9 %19 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT10 %20 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT11 %21 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT12 %22 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT13 %23 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT14 %24 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT15 %25 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT16 %26 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT17 %27 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT18 %28 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT19 %29 ; [clobbered] temporary ZMM
+%define %%ZT20 %30 ; [clobbered] temporary ZMM
+%define %%ZT21 %31 ; [clobbered] temporary ZMM
+%define %%ZT22 %32 ; [clobbered] temporary ZMM
+%define %%ZT23 %33 ; [clobbered] temporary ZMM
+%define %%ADDBE_4x4           %34 ; [in] ZMM with 4 x 128-bit lanes each holding 4, in big-endian
+%define %%ADDBE_1234          %35 ; [in] ZMM with 4 x 128-bit lanes holding 1, 2, 3 and 4, in big-endian
+%define %%TO_REDUCE_L %36 ; [in/out] ZMM for low 4x128-bit GHASH sum
+%define %%TO_REDUCE_H %37 ; [in/out] ZMM for hi 4x128-bit GHASH sum
+%define %%TO_REDUCE_M %38 ; [in/out] ZMM for medium 4x128-bit GHASH sum
+%define %%DO_REDUCTION %39 ; [in] "no_reduction", "final_reduction", "first_time"
+%define %%ENC_DEC %40 ; [in] cipher direction
+%define %%DATA_DISPL %41 ; [in] fixed numerical data displacement/offset
+%define %%GHASH_IN %42 ; [in] current GHASH value or "no_ghash_in"
+
+%define %%B00_03 %%ZT1
+%define %%B04_07 %%ZT2
+%define %%B08_11 %%ZT3
+%define %%B12_15 %%ZT4
+
+%define %%GH1H %%ZT5 ; @note: do not change this mapping
+%define %%GH1L %%ZT6
+%define %%GH1M %%ZT7
+%define %%GH1T %%ZT8
+
+%define %%GH2H %%ZT9
+%define %%GH2L %%ZT10
+%define %%GH2M %%ZT11
+%define %%GH2T %%ZT12
+
+%define %%RED_POLY %%GH2T
+%define %%RED_P1 %%GH2L
+%define %%RED_T1 %%GH2H
+%define %%RED_T2 %%GH2M
+
+%define %%GH3H %%ZT13
+%define %%GH3L %%ZT14
+%define %%GH3M %%ZT15
+%define %%GH3T %%ZT16
+
+%define %%DATA1 %%ZT13
+%define %%DATA2 %%ZT14
+%define %%DATA3 %%ZT15
+%define %%DATA4 %%ZT16
+
+%define %%AESKEY1 %%ZT17
+%define %%AESKEY2 %%ZT18
+
+%define %%GHKEY1 %%ZT19
+%define %%GHKEY2 %%ZT20
+%define %%GHDAT1 %%ZT21
+%define %%GHDAT2 %%ZT22
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; prepare counter blocks
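+        ;; the counter blocks are kept in big-endian form; the fast path below
+        ;; adds the big-endian ADDBE constants directly, which is only valid
+        ;; while the low byte of the counter does not wrap. %%CTR_CHECK tracks
+        ;; that byte, and the overflow path byte-swaps to little-endian,
+        ;; increments, then swaps back.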
+
+ cmp BYTE(%%CTR_CHECK), (256 - 16)
+ jae %%_16_blocks_overflow
+ vpaddd %%B00_03, %%CTR_BE, %%ADDBE_1234
+ vpaddd %%B04_07, %%B00_03, %%ADDBE_4x4
+ vpaddd %%B08_11, %%B04_07, %%ADDBE_4x4
+ vpaddd %%B12_15, %%B08_11, %%ADDBE_4x4
+ jmp %%_16_blocks_ok
+%%_16_blocks_overflow:
+ vpshufb %%CTR_BE, %%CTR_BE, %%SHFMSK
+ vmovdqa64 %%B12_15, [rel ddq_add_4444]
+ vpaddd %%B00_03, %%CTR_BE, [rel ddq_add_1234]
+ vpaddd %%B04_07, %%B00_03, %%B12_15
+ vpaddd %%B08_11, %%B04_07, %%B12_15
+ vpaddd %%B12_15, %%B08_11, %%B12_15
+ vpshufb %%B00_03, %%SHFMSK
+ vpshufb %%B04_07, %%SHFMSK
+ vpshufb %%B08_11, %%SHFMSK
+ vpshufb %%B12_15, %%SHFMSK
+%%_16_blocks_ok:
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; pre-load constants
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 0)]
+%ifnidn %%GHASH_IN, no_ghash_in
+ vpxorq %%GHDAT1, %%GHASH_IN, [rsp + %%GHASHIN_BLK_OFFSET + (0*64)]
+%else
+ vmovdqa64 %%GHDAT1, [rsp + %%GHASHIN_BLK_OFFSET + (0*64)]
+%endif
+ vmovdqu64 %%GHKEY1, [%%GDATA + %%HASHKEY_OFFSET + (0*64)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; save counter for the next round
+ ;; increment counter overflow check register
+ vshufi64x2 %%CTR_BE, %%B12_15, %%B12_15, 1111_1111b
+ add BYTE(%%CTR_CHECK), 16
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; pre-load constants
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 1)]
+ vmovdqu64 %%GHKEY2, [%%GDATA + %%HASHKEY_OFFSET + (1*64)]
+ vmovdqa64 %%GHDAT2, [rsp + %%GHASHIN_BLK_OFFSET + (1*64)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; stitch AES rounds with GHASH
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 0 - ARK
+
+ vpxorq %%B00_03, %%AESKEY1
+ vpxorq %%B04_07, %%AESKEY1
+ vpxorq %%B08_11, %%AESKEY1
+ vpxorq %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 2)]
+
+ ;;==================================================
+ ;; GHASH 4 blocks (15 to 12)
+ vpclmulqdq %%GH1H, %%GHDAT1, %%GHKEY1, 0x11 ; a1*b1
+ vpclmulqdq %%GH1L, %%GHDAT1, %%GHKEY1, 0x00 ; a0*b0
+ vpclmulqdq %%GH1M, %%GHDAT1, %%GHKEY1, 0x01 ; a1*b0
+ vpclmulqdq %%GH1T, %%GHDAT1, %%GHKEY1, 0x10 ; a0*b1
+
+ vmovdqu64 %%GHKEY1, [%%GDATA + %%HASHKEY_OFFSET + (2*64)]
+ vmovdqa64 %%GHDAT1, [rsp + %%GHASHIN_BLK_OFFSET + (2*64)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 1
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 3)]
+
+ ;; =================================================
+ ;; GHASH 4 blocks (11 to 8)
+ vpclmulqdq %%GH2M, %%GHDAT2, %%GHKEY2, 0x10 ; a0*b1
+ vpclmulqdq %%GH2T, %%GHDAT2, %%GHKEY2, 0x01 ; a1*b0
+ vpclmulqdq %%GH2H, %%GHDAT2, %%GHKEY2, 0x11 ; a1*b1
+ vpclmulqdq %%GH2L, %%GHDAT2, %%GHKEY2, 0x00 ; a0*b0
+
+ vmovdqu64 %%GHKEY2, [%%GDATA + %%HASHKEY_OFFSET + (3*64)]
+ vmovdqa64 %%GHDAT2, [rsp + %%GHASHIN_BLK_OFFSET + (3*64)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 2
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 4)]
+
+ ;; =================================================
+ ;; GHASH 4 blocks (7 to 4)
+ vpclmulqdq %%GH3M, %%GHDAT1, %%GHKEY1, 0x10 ; a0*b1
+ vpclmulqdq %%GH3T, %%GHDAT1, %%GHKEY1, 0x01 ; a1*b0
+ vpclmulqdq %%GH3H, %%GHDAT1, %%GHKEY1, 0x11 ; a1*b1
+ vpclmulqdq %%GH3L, %%GHDAT1, %%GHKEY1, 0x00 ; a0*b0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES rounds 3
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 5)]
+
+ ;; =================================================
+ ;; Gather (XOR) GHASH for 12 blocks
+ vpternlogq %%GH1H, %%GH2H, %%GH3H, 0x96
+ vpternlogq %%GH1L, %%GH2L, %%GH3L, 0x96
+ vpternlogq %%GH1T, %%GH2T, %%GH3T, 0x96
+ vpternlogq %%GH1M, %%GH2M, %%GH3M, 0x96
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES rounds 4
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 6)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; load plain/cipher text (recycle GH3xx registers)
+ VX512LDR %%DATA1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (0 * 64)]
+ VX512LDR %%DATA2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (1 * 64)]
+ VX512LDR %%DATA3, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (2 * 64)]
+ VX512LDR %%DATA4, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (3 * 64)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES rounds 5
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 7)]
+
+ ;; =================================================
+ ;; GHASH 4 blocks (3 to 0)
+ vpclmulqdq %%GH2M, %%GHDAT2, %%GHKEY2, 0x10 ; a0*b1
+ vpclmulqdq %%GH2T, %%GHDAT2, %%GHKEY2, 0x01 ; a1*b0
+ vpclmulqdq %%GH2H, %%GHDAT2, %%GHKEY2, 0x11 ; a1*b1
+ vpclmulqdq %%GH2L, %%GHDAT2, %%GHKEY2, 0x00 ; a0*b0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 6
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 8)]
+
+ ;; =================================================
+ ;; gather GHASH in GH1L (low) and GH1H (high)
+%ifidn %%DO_REDUCTION, first_time
+ vpternlogq %%GH1M, %%GH1T, %%GH2T, 0x96 ; TM
+ vpxorq %%TO_REDUCE_M, %%GH1M, %%GH2M ; TM
+ vpxorq %%TO_REDUCE_H, %%GH1H, %%GH2H ; TH
+ vpxorq %%TO_REDUCE_L, %%GH1L, %%GH2L ; TL
+%endif
+%ifidn %%DO_REDUCTION, no_reduction
+ vpternlogq %%GH1M, %%GH1T, %%GH2T, 0x96 ; TM
+ vpternlogq %%TO_REDUCE_M, %%GH1M, %%GH2M, 0x96 ; TM
+ vpternlogq %%TO_REDUCE_H, %%GH1H, %%GH2H, 0x96 ; TH
+ vpternlogq %%TO_REDUCE_L, %%GH1L, %%GH2L, 0x96 ; TL
+%endif
+%ifidn %%DO_REDUCTION, final_reduction
+ ;; phase 1: add mid products together
+ ;; also load polynomial constant for reduction
+ vpternlogq %%GH1M, %%GH1T, %%GH2T, 0x96 ; TM
+ vpternlogq %%GH1M, %%TO_REDUCE_M, %%GH2M, 0x96
+
+ vpsrldq %%GH2M, %%GH1M, 8
+ vpslldq %%GH1M, %%GH1M, 8
+
+ vmovdqa64 XWORD(%%RED_POLY), [rel POLY2]
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 7
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 9)]
+
+ ;; =================================================
+ ;; Add mid product to high and low
+%ifidn %%DO_REDUCTION, final_reduction
+ vpternlogq %%GH1H, %%GH2H, %%GH2M, 0x96 ; TH = TH1 + TH2 + TM>>64
+ vpxorq %%GH1H, %%TO_REDUCE_H
+ vpternlogq %%GH1L, %%GH2L, %%GH1M, 0x96 ; TL = TL1 + TL2 + TM<<64
+ vpxorq %%GH1L, %%TO_REDUCE_L
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 8
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 10)]
+
+ ;; =================================================
+ ;; horizontal xor of low and high 4x128
+%ifidn %%DO_REDUCTION, final_reduction
+ VHPXORI4x128 %%GH1H, %%GH2H
+ VHPXORI4x128 %%GH1L, %%GH2L
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 9
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+%if (NROUNDS >= 11)
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 11)]
+%endif
+ ;; =================================================
+ ;; first phase of reduction
+%ifidn %%DO_REDUCTION, final_reduction
+ vpclmulqdq XWORD(%%RED_P1), XWORD(%%RED_POLY), XWORD(%%GH1L), 0x01
+ vpslldq XWORD(%%RED_P1), XWORD(%%RED_P1), 8 ; shift-L 2 DWs
+        vpxorq          XWORD(%%RED_P1), XWORD(%%GH1L), XWORD(%%RED_P1)    ; first phase of the reduction
+%endif
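+        ;; Note: the final_reduction path folds the accumulated 256-bit
+        ;; carry-less product (high half in GH1H, low half in GH1L) back to
+        ;; 128 bits modulo the GHASH polynomial
+        ;; g(x) = x^128 + x^7 + x^2 + x + 1, with POLY2 assumed to hold the
+        ;; pre-computed reduction constant. Phase 1 above processes GH1L;
+        ;; phase 2 further below combines the result into GH1H, which then
+        ;; carries the reduced GHASH value.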
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES rounds up to 11 (AES192) or 13 (AES256)
+ ;; AES128 is done
+%if (NROUNDS >= 11)
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 12)]
+
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+%if (NROUNDS == 13)
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 13)]
+
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 14)]
+
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+%endif ; GCM256 / NROUNDS = 13 (15 including the first and the last)
+%endif ; GCM192 / NROUNDS = 11 (13 including the first and the last)
+
+ ;; =================================================
+ ;; second phase of the reduction
+%ifidn %%DO_REDUCTION, final_reduction
+ vpclmulqdq XWORD(%%RED_T1), XWORD(%%RED_POLY), XWORD(%%RED_P1), 0x00
+ vpsrldq XWORD(%%RED_T1), XWORD(%%RED_T1), 4 ; shift-R 1-DW to obtain 2-DWs shift-R
+
+ vpclmulqdq XWORD(%%RED_T2), XWORD(%%RED_POLY), XWORD(%%RED_P1), 0x10
+ vpslldq XWORD(%%RED_T2), XWORD(%%RED_T2), 4 ; shift-L 1-DW for result without shifts
+ ;; GH1H = GH1H x RED_T1 x RED_T2
+ vpternlogq XWORD(%%GH1H), XWORD(%%RED_T2), XWORD(%%RED_T1), 0x96
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; the last AES round
+ vaesenclast %%B00_03, %%B00_03, %%AESKEY1
+ vaesenclast %%B04_07, %%B04_07, %%AESKEY1
+ vaesenclast %%B08_11, %%B08_11, %%AESKEY1
+ vaesenclast %%B12_15, %%B12_15, %%AESKEY1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; XOR against plain/cipher text
+ vpxorq %%B00_03, %%B00_03, %%DATA1
+ vpxorq %%B04_07, %%B04_07, %%DATA2
+ vpxorq %%B08_11, %%B08_11, %%DATA3
+ vpxorq %%B12_15, %%B12_15, %%DATA4
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; store cipher/plain text
+ VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (0 * 64)], %%B00_03
+ VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (1 * 64)], %%B04_07
+ VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (2 * 64)], %%B08_11
+ VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (3 * 64)], %%B12_15
+
+ ;; =================================================
+ ;; shuffle cipher text blocks for GHASH computation
+%ifidn %%ENC_DEC, ENC
+ vpshufb %%B00_03, %%B00_03, %%SHFMSK
+ vpshufb %%B04_07, %%B04_07, %%SHFMSK
+ vpshufb %%B08_11, %%B08_11, %%SHFMSK
+ vpshufb %%B12_15, %%B12_15, %%SHFMSK
+%else
+ vpshufb %%B00_03, %%DATA1, %%SHFMSK
+ vpshufb %%B04_07, %%DATA2, %%SHFMSK
+ vpshufb %%B08_11, %%DATA3, %%SHFMSK
+ vpshufb %%B12_15, %%DATA4, %%SHFMSK
+%endif
+
+ ;; =================================================
+ ;; store shuffled cipher text for ghashing
+ vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (0*64)], %%B00_03
+ vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (1*64)], %%B04_07
+ vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (2*64)], %%B08_11
+ vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (3*64)], %%B12_15
+
+%ifidn %%DO_REDUCTION, final_reduction
+ ;; =================================================
+ ;; Return GHASH value through %%GH1H
+%endif
+
+%endmacro ; GHASH_16_ENCRYPT_16_PARALLEL
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; GHASH the last 8 ciphertext blocks.
+;;; - optionally accepts GHASH product sums as input
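+;;; For reference, with input blocks B1..B8 (B1 oldest, the running hash
+;;; already xor-ed into B1 by the caller) and hash key powers H^8..H^1 this
+;;; evaluates, in GF(2^128) mod x^128 + x^7 + x^2 + x + 1:
+;;;     AAD_HASH = B1*H^8 xor B2*H^7 xor ... xor B8*H^1
+;;; via VCLMUL_STEP1/STEP2 carry-less products and one VCLMUL_REDUCE at the
+;;; end; the optional GH/GL/GM sums are xor-ed in before the reduction.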
+%macro GHASH_LAST_8 10-13
+%define %%GDATA %1 ; [in] key pointer
+%define %%BL47 %2 ; [in/clobbered] ZMM AES blocks 4 to 7
+%define %%BL03          %3      ; [in/clobbered] ZMM AES blocks 0 to 3
+%define %%ZTH           %4      ; [clobbered] ZMM temporary
+%define %%ZTM           %5      ; [clobbered] ZMM temporary
+%define %%ZTL           %6      ; [clobbered] ZMM temporary
+%define %%ZT01          %7      ; [clobbered] ZMM temporary
+%define %%ZT02          %8      ; [clobbered] ZMM temporary
+%define %%ZT03          %9      ; [clobbered] ZMM temporary
+%define %%AAD_HASH %10 ; [out] XMM hash value
+%define %%GH %11 ; [in/optional] ZMM with GHASH high product sum
+%define %%GL %12 ; [in/optional] ZMM with GHASH low product sum
+%define %%GM %13 ; [in/optional] ZMM with GHASH mid product sum
+
+ VCLMUL_STEP1 %%GDATA, %%BL47, %%ZT01, %%ZTH, %%ZTM, %%ZTL
+
+%if %0 > 10
+ ;; add optional sums before step2
+ vpxorq %%ZTH, %%ZTH, %%GH
+ vpxorq %%ZTL, %%ZTL, %%GL
+ vpxorq %%ZTM, %%ZTM, %%GM
+%endif
+
+ VCLMUL_STEP2 %%GDATA, %%BL47, %%BL03, %%ZT01, %%ZT02, %%ZT03, %%ZTH, %%ZTM, %%ZTL
+
+ vmovdqa64 XWORD(%%ZT03), [rel POLY2]
+ VCLMUL_REDUCE %%AAD_HASH, XWORD(%%ZT03), XWORD(%%BL47), XWORD(%%BL03), \
+ XWORD(%%ZT01), XWORD(%%ZT02)
+%endmacro ; GHASH_LAST_8
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; GHASH the last 7 cipher text blocks.
+;;; - it uses the same GHASH macros as GHASH_LAST_8 but with a twist
+;;; - it loads GHASH keys for each of the data blocks, so that:
+;;; - blocks 4, 5 and 6 will use GHASH keys 3, 2, 1 respectively
+;;; - code ensures that unused block 7 and corresponding GHASH key are zeroed
+;;; (clmul product is zero this way and will not affect the result)
+;;; - blocks 0, 1, 2 and 3 will use GHASH keys 7, 6, 5 and 4 respectively
+;;; - optionally accepts GHASH product sums as input
+%macro GHASH_LAST_7 13-16
+%define %%GDATA %1 ; [in] key pointer
+%define %%BL47 %2 ; [in/clobbered] ZMM AES blocks 4 to 7
+%define %%BL03          %3      ; [in/clobbered] ZMM AES blocks 0 to 3
+%define %%ZTH           %4      ; [clobbered] ZMM temporary
+%define %%ZTM           %5      ; [clobbered] ZMM temporary
+%define %%ZTL           %6      ; [clobbered] ZMM temporary
+%define %%ZT01          %7      ; [clobbered] ZMM temporary
+%define %%ZT02          %8      ; [clobbered] ZMM temporary
+%define %%ZT03          %9      ; [clobbered] ZMM temporary
+%define %%ZT04          %10     ; [clobbered] ZMM temporary
+%define %%AAD_HASH %11 ; [out] XMM hash value
+%define %%MASKREG %12 ; [clobbered] mask register to use for loads
+%define %%IA0 %13 ; [clobbered] GP temporary register
+%define %%GH %14 ; [in/optional] ZMM with GHASH high product sum
+%define %%GL %15 ; [in/optional] ZMM with GHASH low product sum
+%define %%GM %16 ; [in/optional] ZMM with GHASH mid product sum
+
+ vmovdqa64 XWORD(%%ZT04), [rel POLY2]
+
+ VCLMUL_1_TO_8_STEP1 %%GDATA, %%BL47, %%ZT01, %%ZT02, %%ZTH, %%ZTM, %%ZTL, 7
+
+%if %0 > 13
+ ;; add optional sums before step2
+ vpxorq %%ZTH, %%ZTH, %%GH
+ vpxorq %%ZTL, %%ZTL, %%GL
+ vpxorq %%ZTM, %%ZTM, %%GM
+%endif
+
+ VCLMUL_1_TO_8_STEP2 %%GDATA, %%BL47, %%BL03, \
+ %%ZT01, %%ZT02, %%ZT03, \
+ %%ZTH, %%ZTM, %%ZTL, 7
+
+ VCLMUL_REDUCE %%AAD_HASH, XWORD(%%ZT04), XWORD(%%BL47), XWORD(%%BL03), \
+ XWORD(%%ZT01), XWORD(%%ZT02)
+%endmacro ; GHASH_LAST_7
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Encryption of a single block
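+;;; Standard AES flow: XOR with round key 0, NROUNDS middle rounds of
+;;; vaesenc, then vaesenclast with the last round key. NROUNDS is assumed
+;;; to count only the middle rounds (9/11/13 for AES-128/192/256).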
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%GDATA %1
+%define %%XMM0 %2
+
+ vpxorq %%XMM0, %%XMM0, [%%GDATA+16*0]
+%assign i 1
+%rep NROUNDS
+ vaesenc %%XMM0, [%%GDATA+16*i]
+%assign i (i+1)
+%endrep
+ vaesenclast %%XMM0, [%%GDATA+16*i]
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Save register content for the caller
+%macro FUNC_SAVE 0
+        ;; Required for Update/GCM_ENC
+        ;; the number of pushes must equal STACK_OFFSET
+ mov rax, rsp
+
+ sub rsp, STACK_FRAME_SIZE
+ and rsp, ~63
+
+ mov [rsp + STACK_GP_OFFSET + 0*8], r12
+ mov [rsp + STACK_GP_OFFSET + 1*8], r13
+ mov [rsp + STACK_GP_OFFSET + 2*8], r14
+ mov [rsp + STACK_GP_OFFSET + 3*8], r15
+ mov [rsp + STACK_GP_OFFSET + 4*8], rax ; stack
+ mov r14, rax ; r14 is used to retrieve stack args
+ mov [rsp + STACK_GP_OFFSET + 5*8], rbp
+ mov [rsp + STACK_GP_OFFSET + 6*8], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + STACK_GP_OFFSET + 7*8], rdi
+ mov [rsp + STACK_GP_OFFSET + 8*8], rsi
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ vmovdqu [rsp + STACK_XMM_OFFSET + 0*16], xmm6
+ vmovdqu [rsp + STACK_XMM_OFFSET + 1*16], xmm7
+ vmovdqu [rsp + STACK_XMM_OFFSET + 2*16], xmm8
+ vmovdqu [rsp + STACK_XMM_OFFSET + 3*16], xmm9
+ vmovdqu [rsp + STACK_XMM_OFFSET + 4*16], xmm10
+ vmovdqu [rsp + STACK_XMM_OFFSET + 5*16], xmm11
+ vmovdqu [rsp + STACK_XMM_OFFSET + 6*16], xmm12
+ vmovdqu [rsp + STACK_XMM_OFFSET + 7*16], xmm13
+ vmovdqu [rsp + STACK_XMM_OFFSET + 8*16], xmm14
+ vmovdqu [rsp + STACK_XMM_OFFSET + 9*16], xmm15
+%endif
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Restore register content for the caller
+%macro FUNC_RESTORE 0
+
+%ifdef SAFE_DATA
+ clear_scratch_gps_asm
+ clear_scratch_zmms_asm
+%else
+ vzeroupper
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15, [rsp + STACK_XMM_OFFSET + 9*16]
+ vmovdqu xmm14, [rsp + STACK_XMM_OFFSET + 8*16]
+ vmovdqu xmm13, [rsp + STACK_XMM_OFFSET + 7*16]
+ vmovdqu xmm12, [rsp + STACK_XMM_OFFSET + 6*16]
+ vmovdqu xmm11, [rsp + STACK_XMM_OFFSET + 5*16]
+ vmovdqu xmm10, [rsp + STACK_XMM_OFFSET + 4*16]
+ vmovdqu xmm9, [rsp + STACK_XMM_OFFSET + 3*16]
+ vmovdqu xmm8, [rsp + STACK_XMM_OFFSET + 2*16]
+ vmovdqu xmm7, [rsp + STACK_XMM_OFFSET + 1*16]
+ vmovdqu xmm6, [rsp + STACK_XMM_OFFSET + 0*16]
+%endif
+
+        ;; Required for Update/GCM_ENC
+ mov rbp, [rsp + STACK_GP_OFFSET + 5*8]
+ mov rbx, [rsp + STACK_GP_OFFSET + 6*8]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [rsp + STACK_GP_OFFSET + 7*8]
+ mov rsi, [rsp + STACK_GP_OFFSET + 8*8]
+%endif
+ mov r12, [rsp + STACK_GP_OFFSET + 0*8]
+ mov r13, [rsp + STACK_GP_OFFSET + 1*8]
+ mov r14, [rsp + STACK_GP_OFFSET + 2*8]
+ mov r15, [rsp + STACK_GP_OFFSET + 3*8]
+ mov rsp, [rsp + STACK_GP_OFFSET + 4*8] ; stack
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
+;;; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV,
+;;; Additional Authenticated Data (A_IN) and AAD length in bytes (A_LEN).
+;;; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and the remaining GDATA_CTX fields initialized.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 21
+%define %%GDATA_KEY %1 ; [in] GCM expanded keys pointer
+%define %%GDATA_CTX %2 ; [in] GCM context pointer
+%define %%IV %3 ; [in] IV pointer
+%define %%A_IN %4 ; [in] AAD pointer
+%define %%A_LEN %5 ; [in] AAD length in bytes
+%define %%GPR1 %6 ; [clobbered] GP register
+%define %%GPR2 %7 ; [clobbered] GP register
+%define %%GPR3 %8 ; [clobbered] GP register
+%define %%MASKREG %9 ; [clobbered] mask register
+%define %%AAD_HASH %10 ; [out] XMM for AAD_HASH value (xmm14)
+%define %%CUR_COUNT %11 ; [out] XMM with current counter (xmm2)
+%define %%ZT0 %12 ; [clobbered] ZMM register
+%define %%ZT1 %13 ; [clobbered] ZMM register
+%define %%ZT2 %14 ; [clobbered] ZMM register
+%define %%ZT3 %15 ; [clobbered] ZMM register
+%define %%ZT4 %16 ; [clobbered] ZMM register
+%define %%ZT5 %17 ; [clobbered] ZMM register
+%define %%ZT6 %18 ; [clobbered] ZMM register
+%define %%ZT7 %19 ; [clobbered] ZMM register
+%define %%ZT8 %20 ; [clobbered] ZMM register
+%define %%ZT9 %21 ; [clobbered] ZMM register
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, \
+ %%ZT0, %%ZT1, %%ZT2, %%ZT3, %%ZT4, %%ZT5, %%ZT6, %%ZT7, %%ZT8, %%ZT9, \
+ %%GPR1, %%GPR2, %%GPR3, %%MASKREG
+
+ mov %%GPR1, %%A_LEN
+ vmovdqu64 [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx.aad hash = aad_hash
+ mov [%%GDATA_CTX + AadLen], %%GPR1 ; ctx.aad_length = aad_length
+
+ xor %%GPR1, %%GPR1
+ mov [%%GDATA_CTX + InLen], %%GPR1 ; ctx.in_length = 0
+ mov [%%GDATA_CTX + PBlockLen], %%GPR1 ; ctx.partial_block_length = 0
+
+ ;; read 12 IV bytes and pad with 0x00000001
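+        ;; (GCM J0 for a 96-bit IV: J0 = IV || 0^31 || 1. The 0x0fff byte
+        ;;  mask below selects bytes 0-11 for the IV load while bytes 12-15
+        ;;  keep the 0x00000001 pattern pre-loaded from ONEf.)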
+ vmovdqu8 %%CUR_COUNT, [rel ONEf]
+ mov %%GPR2, %%IV
+ mov %%GPR1, 0x0000_0000_0000_0fff
+ kmovq %%MASKREG, %%GPR1
+ vmovdqu8 %%CUR_COUNT{%%MASKREG}, [%%GPR2] ; ctr = IV | 0x1
+
+ vmovdqu64 [%%GDATA_CTX + OrigIV], %%CUR_COUNT ; ctx.orig_IV = iv
+
+ ;; store IV as counter in LE format
+ vpshufb %%CUR_COUNT, [rel SHUF_MASK]
+ vmovdqu [%%GDATA_CTX + CurCount], %%CUR_COUNT ; ctx.current_counter = iv
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Cipher and ghash of payloads shorter than 256 bytes
+;;; - number of blocks in the message comes as argument
+;;; - depending on the number of blocks an optimized variant of
+;;; INITIAL_BLOCKS_PARTIAL is invoked
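+;;; The dispatch below is a small compare tree into one of 16 variants
+;;; generated by the %rep block at the end of this macro; for example
+;;; %%NUM_BLOCKS = 10 falls past the 'cmp 8' checks, takes the 'jl' at
+;;; 'cmp 12' into %%_small_initial_num_blocks_is_11_9 and then lands on
+;;; %%_small_initial_num_blocks_is_10.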
+%macro GCM_ENC_DEC_SMALL 42
+%define %%GDATA_KEY %1 ; [in] key pointer
+%define %%GDATA_CTX %2 ; [in] context pointer
+%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer
+%define %%PLAIN_CYPH_IN %4 ; [in] input buffer
+%define %%PLAIN_CYPH_LEN %5 ; [in] buffer length
+%define %%ENC_DEC %6 ; [in] cipher direction
+%define %%DATA_OFFSET %7 ; [in] data offset
+%define %%LENGTH %8 ; [in] data length
+%define %%NUM_BLOCKS %9 ; [in] number of blocks to process 1 to 16
+%define %%CTR %10 ; [in/out] XMM counter block
+%define %%HASH_IN_OUT %11 ; [in/out] XMM GHASH value
+%define %%INSTANCE_TYPE %12 ; [in] single or multi call
+%define %%ZTMP0 %13 ; [clobbered] ZMM register
+%define %%ZTMP1 %14 ; [clobbered] ZMM register
+%define %%ZTMP2 %15 ; [clobbered] ZMM register
+%define %%ZTMP3 %16 ; [clobbered] ZMM register
+%define %%ZTMP4 %17 ; [clobbered] ZMM register
+%define %%ZTMP5 %18 ; [clobbered] ZMM register
+%define %%ZTMP6 %19 ; [clobbered] ZMM register
+%define %%ZTMP7 %20 ; [clobbered] ZMM register
+%define %%ZTMP8 %21 ; [clobbered] ZMM register
+%define %%ZTMP9 %22 ; [clobbered] ZMM register
+%define %%ZTMP10 %23 ; [clobbered] ZMM register
+%define %%ZTMP11 %24 ; [clobbered] ZMM register
+%define %%ZTMP12 %25 ; [clobbered] ZMM register
+%define %%ZTMP13 %26 ; [clobbered] ZMM register
+%define %%ZTMP14 %27 ; [clobbered] ZMM register
+%define %%ZTMP15 %28 ; [clobbered] ZMM register
+%define %%ZTMP16 %29 ; [clobbered] ZMM register
+%define %%ZTMP17 %30 ; [clobbered] ZMM register
+%define %%ZTMP18 %31 ; [clobbered] ZMM register
+%define %%ZTMP19 %32 ; [clobbered] ZMM register
+%define %%ZTMP20 %33 ; [clobbered] ZMM register
+%define %%ZTMP21 %34 ; [clobbered] ZMM register
+%define %%ZTMP22 %35 ; [clobbered] ZMM register
+%define %%GH %36 ; [in] ZMM ghash sum (high)
+%define %%GL %37 ; [in] ZMM ghash sum (low)
+%define %%GM %38 ; [in] ZMM ghash sum (middle)
+%define %%IA0 %39 ; [clobbered] GP register
+%define %%IA1 %40 ; [clobbered] GP register
+%define %%MASKREG %41 ; [clobbered] mask register
+%define %%SHUFMASK %42 ; [in] ZMM with BE/LE shuffle mask
+
+ cmp %%NUM_BLOCKS, 8
+ je %%_small_initial_num_blocks_is_8
+ jl %%_small_initial_num_blocks_is_7_1
+
+
+ cmp %%NUM_BLOCKS, 12
+ je %%_small_initial_num_blocks_is_12
+ jl %%_small_initial_num_blocks_is_11_9
+
+ ;; 16, 15, 14 or 13
+ cmp %%NUM_BLOCKS, 16
+ je %%_small_initial_num_blocks_is_16
+ cmp %%NUM_BLOCKS, 15
+ je %%_small_initial_num_blocks_is_15
+ cmp %%NUM_BLOCKS, 14
+ je %%_small_initial_num_blocks_is_14
+ jmp %%_small_initial_num_blocks_is_13
+
+%%_small_initial_num_blocks_is_11_9:
+ ;; 11, 10 or 9
+ cmp %%NUM_BLOCKS, 11
+ je %%_small_initial_num_blocks_is_11
+ cmp %%NUM_BLOCKS, 10
+ je %%_small_initial_num_blocks_is_10
+ jmp %%_small_initial_num_blocks_is_9
+
+%%_small_initial_num_blocks_is_7_1:
+ cmp %%NUM_BLOCKS, 4
+ je %%_small_initial_num_blocks_is_4
+ jl %%_small_initial_num_blocks_is_3_1
+ ;; 7, 6 or 5
+ cmp %%NUM_BLOCKS, 7
+ je %%_small_initial_num_blocks_is_7
+ cmp %%NUM_BLOCKS, 6
+ je %%_small_initial_num_blocks_is_6
+ jmp %%_small_initial_num_blocks_is_5
+
+%%_small_initial_num_blocks_is_3_1:
+ ;; 3, 2 or 1
+ cmp %%NUM_BLOCKS, 3
+ je %%_small_initial_num_blocks_is_3
+ cmp %%NUM_BLOCKS, 2
+ je %%_small_initial_num_blocks_is_2
+
+ ;; for %%NUM_BLOCKS == 1, just fall through and no 'jmp' needed
+
+        ;; Use %rep to generate the different block count variants
+        ;; - the one block variant has to come first (fall-through case above)
+%assign num_blocks 1
+%rep 16
+%%_small_initial_num_blocks_is_ %+ num_blocks :
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, num_blocks, \
+ %%CTR, %%HASH_IN_OUT, %%ENC_DEC, %%INSTANCE_TYPE, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \
+ %%ZTMP5, %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, \
+ %%ZTMP10, %%ZTMP11, %%ZTMP12, %%ZTMP13, %%ZTMP14, \
+ %%ZTMP15, %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ %%GH, %%GL, %%GM, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFMASK
+%if num_blocks != 16
+ jmp %%_small_initial_blocks_encrypted
+%endif
+%assign num_blocks (num_blocks + 1)
+%endrep
+
+%%_small_initial_blocks_encrypted:
+
+%endmacro ; GCM_ENC_DEC_SMALL
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
+; has been initialized by GCM_INIT
+; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA.
+; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
+; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC).
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10-r15, and zmm0-zmm31, k1
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 7
+%define %%GDATA_KEY %1 ; [in] key pointer
+%define %%GDATA_CTX %2 ; [in] context pointer
+%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer pointer
+%define %%PLAIN_CYPH_IN %4 ; [in] input buffer pointer
+%define %%PLAIN_CYPH_LEN %5 ; [in] buffer length
+%define %%ENC_DEC %6 ; [in] cipher direction
+%define %%INSTANCE_TYPE %7 ; [in] 'single_call' or 'multi_call' selection
+
+%define %%IA0 r10
+%define %%IA1 r12
+%define %%IA2 r13
+%define %%IA3 r15
+%define %%IA4 r11
+%define %%IA5 rax
+
+%define %%LENGTH %%IA2
+%define %%CTR_CHECK %%IA3
+%define %%DATA_OFFSET %%IA4
+
+%define %%HASHK_PTR %%IA5
+
+%define %%GCM_INIT_CTR_BLOCK xmm2 ; hardcoded in GCM_INIT for now
+
+%define %%AES_PARTIAL_BLOCK xmm8
+%define %%CTR_BLOCK2z zmm18
+%define %%CTR_BLOCKz zmm9
+%define %%CTR_BLOCKx xmm9
+%define %%AAD_HASHz zmm14
+%define %%AAD_HASHx xmm14
+
+;;; ZTMP0 - ZTMP12 - used in by8 code, by128/48 code and GCM_ENC_DEC_SMALL
+%define %%ZTMP0 zmm0
+%define %%ZTMP1 zmm3
+%define %%ZTMP2 zmm4
+%define %%ZTMP3 zmm5
+%define %%ZTMP4 zmm6
+%define %%ZTMP5 zmm7
+%define %%ZTMP6 zmm10
+%define %%ZTMP7 zmm11
+%define %%ZTMP8 zmm12
+%define %%ZTMP9 zmm13
+%define %%ZTMP10 zmm15
+%define %%ZTMP11 zmm16
+%define %%ZTMP12 zmm17
+
+;;; ZTMP13 - ZTMP22 - used in by128/48 code and GCM_ENC_DEC_SMALL
+;;; - some are used by the by8 code as well through TMPxy names
+%define %%ZTMP13 zmm19
+%define %%ZTMP14 zmm20
+%define %%ZTMP15 zmm21
+%define %%ZTMP16 zmm30 ; can be used in very/big_loop part
+%define %%ZTMP17 zmm31 ; can be used in very/big_loop part
+%define %%ZTMP18 zmm1
+%define %%ZTMP19 zmm2
+%define %%ZTMP20 zmm8
+%define %%ZTMP21 zmm22
+%define %%ZTMP22 zmm23
+
+;;; Free to use: zmm24 - zmm29
+;;; - used by by128/48 and by8
+%define %%GH zmm24
+%define %%GL zmm25
+%define %%GM zmm26
+%define %%SHUF_MASK zmm29
+%define %%CTR_BLOCK_SAVE zmm28
+
+;;; - used by by128/48 code only
+%define %%ADDBE_4x4 zmm27
+%define %%ADDBE_1234 zmm28 ; conflicts with CTR_BLOCK_SAVE
+
+;; used by the by8 code only
+%define %%GH4KEY %%ZTMP17
+%define %%GH8KEY %%ZTMP16
+%define %%BLK0 %%ZTMP18
+%define %%BLK1 %%ZTMP19
+%define %%ADD8BE zmm27
+%define %%ADD8LE %%ZTMP13
+
+%define %%MASKREG k1
+
+%ifdef GCM_BIG_DATA
+;; reduction every 128 blocks, depth 32 blocks
+;; @note 128 blocks is the maximum capacity of the stack frame when
+;; GCM_BIG_DATA is defined
+%assign very_big_loop_nblocks 128
+%assign very_big_loop_depth 32
+%endif
+
+;; reduction every 48 blocks, depth 32 blocks
+;; @note 48 blocks is the maximum capacity of the stack frame when
+;; GCM_BIG_DATA is not defined
+%assign big_loop_nblocks 48
+%assign big_loop_depth 32
+
+;;; Macro flow:
+;;; - for message size bigger than very_big_loop_nblocks process data
+;;; with "very_big_loop" parameters
+;;; - for message size bigger than big_loop_nblocks process data
+;;; with "big_loop" parameters
+;;; - calculate the number of 16-byte blocks in the message
+;;; - process (number of 16-byte blocks) mod 8
+;;;   '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+;;; - process 8 x 16-byte blocks at a time until all are done in %%_encrypt_by_8_new
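+;;; Rough message-size buckets (without GCM_BIG_DATA):
+;;; - length < 256 bytes: GCM_ENC_DEC_SMALL (up to 16 blocks)
+;;; - 256 bytes <= length < 48 x 16 bytes: INITIAL_BLOCKS + the by8 loop
+;;; - length >= 48 x 16 bytes: the by48 stitched loop (by128 first when
+;;;   GCM_BIG_DATA is defined and length >= 128 x 16 bytes); left-over data
+;;;   after a big loop drops back into the smaller paths above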
+
+%ifidn __OUTPUT_FORMAT__, win64
+ cmp %%PLAIN_CYPH_LEN, 0
+%else
+ or %%PLAIN_CYPH_LEN, %%PLAIN_CYPH_LEN
+%endif
+ je %%_enc_dec_done
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+
+ ;; Update length of data processed
+%ifidn __OUTPUT_FORMAT__, win64
+ mov %%IA0, %%PLAIN_CYPH_LEN
+ add [%%GDATA_CTX + InLen], %%IA0
+%else
+ add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN
+%endif
+ vmovdqu64 %%AAD_HASHx, [%%GDATA_CTX + AadHash]
+
+%ifidn %%INSTANCE_TYPE, multi_call
+        ;; NOTE: partial block processing only makes sense for multi_call here.
+ ;; Used for the update flow - if there was a previous partial
+ ;; block fill the remaining bytes here.
+ PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%PLAIN_CYPH_LEN, %%DATA_OFFSET, %%AAD_HASHx, %%ENC_DEC, \
+ %%IA0, %%IA1, %%IA2, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \
+ %%ZTMP5, %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, %%MASKREG
+%endif
+
+ ;; lift counter block from GCM_INIT to here
+%ifidn %%INSTANCE_TYPE, single_call
+ vmovdqu64 %%CTR_BLOCKx, %%GCM_INIT_CTR_BLOCK
+%else
+ vmovdqu64 %%CTR_BLOCKx, [%%GDATA_CTX + CurCount]
+%endif
+
+ ;; Save the amount of data left to process in %%LENGTH
+ mov %%LENGTH, %%PLAIN_CYPH_LEN
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; NOTE: %%DATA_OFFSET is zero in single_call case.
+ ;; Consequently PLAIN_CYPH_LEN will never be zero after
+ ;; %%DATA_OFFSET subtraction below.
+ ;; There may be no more data if it was consumed in the partial block.
+ sub %%LENGTH, %%DATA_OFFSET
+ je %%_enc_dec_done
+%endif ; %%INSTANCE_TYPE, multi_call
+
+ vmovdqa64 %%SHUF_MASK, [rel SHUF_MASK]
+ vmovdqa64 %%ADDBE_4x4, [rel ddq_addbe_4444]
+
+%ifdef GCM_BIG_DATA
+ vmovdqa64 %%ADDBE_1234, [rel ddq_addbe_1234]
+
+ cmp %%LENGTH, (very_big_loop_nblocks * 16)
+ jl %%_message_below_very_big_nblocks
+
+ INITIAL_BLOCKS_Nx16 %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \
+ %%AAD_HASHz, %%CTR_BLOCKz, %%CTR_CHECK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ %%GH, %%GL, %%GM, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%SHUF_MASK, %%ENC_DEC, very_big_loop_nblocks, very_big_loop_depth
+
+ sub %%LENGTH, (very_big_loop_nblocks * 16)
+ cmp %%LENGTH, (very_big_loop_nblocks * 16)
+ jl %%_no_more_very_big_nblocks
+
+%%_encrypt_very_big_nblocks:
+ GHASH_ENCRYPT_Nx16_PARALLEL \
+ %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \
+ %%CTR_BLOCKz, %%SHUF_MASK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ %%GH, %%GL, %%GM, \
+ %%ADDBE_4x4, %%ADDBE_1234, %%AAD_HASHz, \
+ %%ENC_DEC, very_big_loop_nblocks, very_big_loop_depth, %%CTR_CHECK
+
+ sub %%LENGTH, (very_big_loop_nblocks * 16)
+ cmp %%LENGTH, (very_big_loop_nblocks * 16)
+ jge %%_encrypt_very_big_nblocks
+
+%%_no_more_very_big_nblocks:
+ vpshufb %%CTR_BLOCKx, XWORD(%%SHUF_MASK)
+ vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx
+
+ GHASH_LAST_Nx16 %%GDATA_KEY, %%AAD_HASHz, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%GH, %%GL, %%GM, very_big_loop_nblocks, very_big_loop_depth
+
+ or %%LENGTH, %%LENGTH
+ jz %%_ghash_done
+
+%%_message_below_very_big_nblocks:
+%endif ; GCM_BIG_DATA
+
+ cmp %%LENGTH, (big_loop_nblocks * 16)
+ jl %%_message_below_big_nblocks
+
+ ;; overwritten above by CTR_BLOCK_SAVE
+ vmovdqa64 %%ADDBE_1234, [rel ddq_addbe_1234]
+
+ INITIAL_BLOCKS_Nx16 %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \
+ %%AAD_HASHz, %%CTR_BLOCKz, %%CTR_CHECK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ %%GH, %%GL, %%GM, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%SHUF_MASK, %%ENC_DEC, big_loop_nblocks, big_loop_depth
+
+ sub %%LENGTH, (big_loop_nblocks * 16)
+ cmp %%LENGTH, (big_loop_nblocks * 16)
+ jl %%_no_more_big_nblocks
+
+%%_encrypt_big_nblocks:
+ GHASH_ENCRYPT_Nx16_PARALLEL \
+ %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \
+ %%CTR_BLOCKz, %%SHUF_MASK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ %%GH, %%GL, %%GM, \
+ %%ADDBE_4x4, %%ADDBE_1234, %%AAD_HASHz, \
+ %%ENC_DEC, big_loop_nblocks, big_loop_depth, %%CTR_CHECK
+
+ sub %%LENGTH, (big_loop_nblocks * 16)
+ cmp %%LENGTH, (big_loop_nblocks * 16)
+ jge %%_encrypt_big_nblocks
+
+%%_no_more_big_nblocks:
+ vpshufb %%CTR_BLOCKx, XWORD(%%SHUF_MASK)
+ vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx
+
+ GHASH_LAST_Nx16 %%GDATA_KEY, %%AAD_HASHz, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%GH, %%GL, %%GM, big_loop_nblocks, big_loop_depth
+
+ or %%LENGTH, %%LENGTH
+ jz %%_ghash_done
+
+%%_message_below_big_nblocks:
+
+ ;; Less than 256 bytes will be handled by the small message code, which
+        ;; can process up to 16 blocks (16 bytes each)
+ cmp %%LENGTH, (16 * 16)
+ jge %%_large_message_path
+
+ ;; Determine how many blocks to process
+ ;; - process one additional block if there is a partial block
+ mov %%IA1, %%LENGTH
+ add %%IA1, 15
+ shr %%IA1, 4
+ ;; %%IA1 can be in the range from 0 to 16
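+        ;; e.g. %%LENGTH = 40 bytes -> (40 + 15) >> 4 = 3 blocks (1 partial)
+        ;;      %%LENGTH = 48 bytes -> (48 + 15) >> 4 = 3 blocks (no partial)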
+
+ GCM_ENC_DEC_SMALL \
+ %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, \
+ %%LENGTH, %%IA1, %%CTR_BLOCKx, %%AAD_HASHx, %%INSTANCE_TYPE, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ no_zmm, no_zmm, no_zmm, \
+ %%IA0, %%IA3, %%MASKREG, %%SHUF_MASK
+
+ vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx
+
+ jmp %%_ghash_done
+
+%%_large_message_path:
+ ;; Determine how many blocks to process in INITIAL
+ ;; - process one additional block in INITIAL if there is a partial block
+ mov %%IA1, %%LENGTH
+ and %%IA1, 0xff
+ add %%IA1, 15
+ shr %%IA1, 4
+ ;; Don't allow 8 INITIAL blocks since this will
+ ;; be handled by the x8 partial loop.
+ and %%IA1, 7
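+        ;; e.g. %%LENGTH = 300 bytes: 300 & 0xff = 44, (44 + 15) >> 4 = 3,
+        ;;      3 & 7 = 3 -> three INITIAL blocks before the by8 loop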
+ je %%_initial_num_blocks_is_0
+ cmp %%IA1, 1
+ je %%_initial_num_blocks_is_1
+ cmp %%IA1, 2
+ je %%_initial_num_blocks_is_2
+ cmp %%IA1, 3
+ je %%_initial_num_blocks_is_3
+ cmp %%IA1, 4
+ je %%_initial_num_blocks_is_4
+ cmp %%IA1, 5
+ je %%_initial_num_blocks_is_5
+ cmp %%IA1, 6
+ je %%_initial_num_blocks_is_6
+
+%assign number_of_blocks 7
+%rep 8
+%%_initial_num_blocks_is_ %+ number_of_blocks:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, number_of_blocks, %%CTR_BLOCKx, %%AAD_HASHz, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \
+ %%ZTMP5, %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%IA0, %%IA1, %%ENC_DEC, %%MASKREG, %%SHUF_MASK, no_partial_block
+%if number_of_blocks != 0
+ jmp %%_initial_blocks_encrypted
+%endif
+%assign number_of_blocks (number_of_blocks - 1)
+%endrep
+
+%%_initial_blocks_encrypted:
+ vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx
+
+        ;; move cipher blocks from the initial blocks to the input of the by8 macro
+ ;; and for GHASH_LAST_8/7
+ ;; - ghash value already xor'ed into block 0
+ vmovdqa64 %%BLK0, %%ZTMP0
+ vmovdqa64 %%BLK1, %%ZTMP1
+
+ ;; The entire message cannot get processed in INITIAL_BLOCKS
+ ;; - GCM_ENC_DEC_SMALL handles up to 16 blocks
+ ;; - INITIAL_BLOCKS processes up to 15 blocks
+ ;; - no need to check for zero length at this stage
+
+ ;; In order to have only one reduction at the end
+ ;; start HASH KEY pointer needs to be determined based on length and
+ ;; call type.
+ ;; - note that 8 blocks are already ciphered in INITIAL_BLOCKS and
+ ;; subtracted from LENGTH
+ lea %%IA1, [%%LENGTH + (8 * 16)]
+ add %%IA1, 15
+ and %%IA1, 0x3f0
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; if partial block and multi_call then change hash key start by one
+ mov %%IA0, %%LENGTH
+ and %%IA0, 15
+ add %%IA0, 15
+ and %%IA0, 16
+ sub %%IA1, %%IA0
+%endif
+ lea %%HASHK_PTR, [%%GDATA_KEY + HashKey + 16]
+ sub %%HASHK_PTR, %%IA1
+ ;; HASHK_PTR
+ ;; - points at the first hash key to start GHASH with
+ ;; - needs to be updated as the message is processed (incremented)
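+        ;; i.e. the starting key power is picked so that, advancing by
+        ;; 8 keys per 8-block iteration, the powers count down to
+        ;; HashKey_8..1 for the final blocks (hashed by GHASH_LAST_8/7),
+        ;; which is what allows a single reduction for the whole message.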
+
+ ;; pre-load constants
+ vmovdqa64 %%ADD8BE, [rel ddq_addbe_8888]
+ vmovdqa64 %%ADD8LE, [rel ddq_add_8888]
+ vpxorq %%GH, %%GH
+ vpxorq %%GL, %%GL
+ vpxorq %%GM, %%GM
+
+ ;; prepare counter 8 blocks
+ vshufi64x2 %%CTR_BLOCKz, %%CTR_BLOCKz, %%CTR_BLOCKz, 0
+ vpaddd %%CTR_BLOCK2z, %%CTR_BLOCKz, [rel ddq_add_5678]
+ vpaddd %%CTR_BLOCKz, %%CTR_BLOCKz, [rel ddq_add_1234]
+ vpshufb %%CTR_BLOCKz, %%SHUF_MASK
+ vpshufb %%CTR_BLOCK2z, %%SHUF_MASK
+
+ ;; Process 7 full blocks plus a partial block
+ cmp %%LENGTH, 128
+ jl %%_encrypt_by_8_partial
+
+%%_encrypt_by_8_parallel:
+ ;; in_order vs. out_order is an optimization to increment the counter
+ ;; without shuffling it back into little endian.
+ ;; %%CTR_CHECK keeps track of when we need to increment in order so
+ ;; that the carry is handled correctly.
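+        ;; Adding the big-endian constant 8 (ADD8BE) is only safe while no
+        ;; carry propagates out of the lowest counter byte; %%CTR_CHECK
+        ;; tracks that byte and, once the next addition would overflow it,
+        ;; the %%_encrypt_by_8 path shuffles the counters to LE, increments
+        ;; them there and shuffles back to BE.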
+
+ vmovq %%CTR_CHECK, XWORD(%%CTR_BLOCK_SAVE)
+
+%%_encrypt_by_8_new:
+ and WORD(%%CTR_CHECK), 255
+ add WORD(%%CTR_CHECK), 8
+
+ vmovdqu64 %%GH4KEY, [%%HASHK_PTR + (4 * 16)]
+ vmovdqu64 %%GH8KEY, [%%HASHK_PTR + (0 * 16)]
+
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%DATA_OFFSET, %%CTR_BLOCKz, %%CTR_BLOCK2z,\
+ %%BLK0, %%BLK1, %%AES_PARTIAL_BLOCK, \
+ out_order, %%ENC_DEC, full, %%IA0, %%IA1, %%LENGTH, %%INSTANCE_TYPE, \
+ %%GH4KEY, %%GH8KEY, %%SHUF_MASK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, \
+ %%ZTMP7, %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, %%ZTMP12, \
+ %%MASKREG, no_reduction, %%GL, %%GH, %%GM
+
+ add %%HASHK_PTR, (8 * 16)
+ add %%DATA_OFFSET, 128
+ sub %%LENGTH, 128
+ jz %%_encrypt_done
+
+ cmp WORD(%%CTR_CHECK), (256 - 8)
+ jae %%_encrypt_by_8
+
+ vpaddd %%CTR_BLOCKz, %%ADD8BE
+ vpaddd %%CTR_BLOCK2z, %%ADD8BE
+
+ cmp %%LENGTH, 128
+ jl %%_encrypt_by_8_partial
+
+ jmp %%_encrypt_by_8_new
+
+%%_encrypt_by_8:
+ vpshufb %%CTR_BLOCKz, %%SHUF_MASK
+ vpshufb %%CTR_BLOCK2z, %%SHUF_MASK
+ vpaddd %%CTR_BLOCKz, %%ADD8LE
+ vpaddd %%CTR_BLOCK2z, %%ADD8LE
+ vpshufb %%CTR_BLOCKz, %%SHUF_MASK
+ vpshufb %%CTR_BLOCK2z, %%SHUF_MASK
+
+ cmp %%LENGTH, 128
+ jge %%_encrypt_by_8_new
+
+%%_encrypt_by_8_partial:
+ ;; Test to see if we need a by 8 with partial block. At this point
+        ;; bytes remaining should be either zero or between 113 and 127.
+ ;; 'in_order' shuffle needed to align key for partial block xor.
+ ;; 'out_order' is a little faster because it avoids extra shuffles.
+ ;; - counter blocks for the next 8 blocks are prepared and in BE format
+ ;; - we can go ahead with out_order scenario
+
+ vmovdqu64 %%GH4KEY, [%%HASHK_PTR + (4 * 16)]
+ vmovdqu64 %%GH8KEY, [%%HASHK_PTR + (0 * 16)]
+
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%DATA_OFFSET, %%CTR_BLOCKz, %%CTR_BLOCK2z, \
+ %%BLK0, %%BLK1, %%AES_PARTIAL_BLOCK, \
+ out_order, %%ENC_DEC, partial, %%IA0, %%IA1, %%LENGTH, %%INSTANCE_TYPE, \
+ %%GH4KEY, %%GH8KEY, %%SHUF_MASK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, \
+ %%ZTMP7, %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, %%ZTMP12, \
+ %%MASKREG, no_reduction, %%GL, %%GH, %%GM
+
+ add %%HASHK_PTR, (8 * 16)
+ add %%DATA_OFFSET, (128 - 16)
+ sub %%LENGTH, (128 - 16)
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH
+ vmovdqu64 [%%GDATA_CTX + PBlockEncKey], %%AES_PARTIAL_BLOCK
+%endif
+
+%%_encrypt_done:
+ ;; Extract the last counter block in LE format
+ vextracti32x4 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCK2z, 3
+ vpshufb XWORD(%%CTR_BLOCK_SAVE), XWORD(%%SHUF_MASK)
+
+ ;; GHASH last cipher text blocks in xmm1-xmm8
+        ;; - if the 8th block is partial in a multi-call path then skip that block
+%ifidn %%INSTANCE_TYPE, multi_call
+ cmp qword [%%GDATA_CTX + PBlockLen], 0
+ jz %%_hash_last_8
+
+ ;; save the 8th partial block as GHASH_LAST_7 will clobber %%BLK1
+ vextracti32x4 XWORD(%%ZTMP7), %%BLK1, 3
+
+ GHASH_LAST_7 %%GDATA_KEY, %%BLK1, %%BLK0, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, \
+ %%AAD_HASHx, %%MASKREG, %%IA0, %%GH, %%GL, %%GM
+
+        ;; XOR the partial block into the hash
+ vpxorq %%AAD_HASHx, %%AAD_HASHx, XWORD(%%ZTMP7)
+ jmp %%_ghash_done
+%%_hash_last_8:
+%endif
+ GHASH_LAST_8 %%GDATA_KEY, %%BLK1, %%BLK0, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%AAD_HASHx, \
+ %%GH, %%GL, %%GM
+%%_ghash_done:
+ vmovdqu64 [%%GDATA_CTX + CurCount], XWORD(%%CTR_BLOCK_SAVE)
+ vmovdqu64 [%%GDATA_CTX + AadHash], %%AAD_HASHx
+%%_enc_dec_done:
+
+%endmacro ; GCM_ENC_DEC
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; Encrypt/decrypt the initial 16 blocks
+%macro INITIAL_BLOCKS_16 22
+%define %%IN %1 ; [in] input buffer
+%define %%OUT %2 ; [in] output buffer
+%define %%KP %3 ; [in] pointer to expanded keys
+%define %%DATA_OFFSET %4 ; [in] data offset
+%define %%GHASH %5 ; [in] ZMM with AAD (low 128 bits)
+%define %%CTR %6 ; [in] ZMM with CTR BE blocks 4x128 bits
+%define %%CTR_CHECK %7 ; [in/out] GPR with counter overflow check
+%define %%ADDBE_4x4 %8 ; [in] ZMM 4x128bits with value 4 (big endian)
+%define %%ADDBE_1234 %9 ; [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian)
+%define %%T0            %10     ; [clobbered] temporary ZMM register
+%define %%T1            %11     ; [clobbered] temporary ZMM register
+%define %%T2            %12     ; [clobbered] temporary ZMM register
+%define %%T3            %13     ; [clobbered] temporary ZMM register
+%define %%T4            %14     ; [clobbered] temporary ZMM register
+%define %%T5            %15     ; [clobbered] temporary ZMM register
+%define %%T6            %16     ; [clobbered] temporary ZMM register
+%define %%T7            %17     ; [clobbered] temporary ZMM register
+%define %%T8            %18     ; [clobbered] temporary ZMM register
+%define %%SHUF_MASK %19 ; [in] ZMM with BE/LE shuffle mask
+%define %%ENC_DEC %20 ; [in] ENC (encrypt) or DEC (decrypt) selector
+%define %%BLK_OFFSET %21 ; [in] stack frame offset to ciphered blocks
+%define %%DATA_DISPL %22 ; [in] fixed numerical data displacement/offset
+
+%define %%B00_03 %%T5
+%define %%B04_07 %%T6
+%define %%B08_11 %%T7
+%define %%B12_15 %%T8
+
+%assign stack_offset (%%BLK_OFFSET)
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; prepare counter blocks
+
+ cmp BYTE(%%CTR_CHECK), (256 - 16)
+ jae %%_next_16_overflow
+ vpaddd %%B00_03, %%CTR, %%ADDBE_1234
+ vpaddd %%B04_07, %%B00_03, %%ADDBE_4x4
+ vpaddd %%B08_11, %%B04_07, %%ADDBE_4x4
+ vpaddd %%B12_15, %%B08_11, %%ADDBE_4x4
+ jmp %%_next_16_ok
+%%_next_16_overflow:
+ vpshufb %%CTR, %%CTR, %%SHUF_MASK
+ vmovdqa64 %%B12_15, [rel ddq_add_4444]
+ vpaddd %%B00_03, %%CTR, [rel ddq_add_1234]
+ vpaddd %%B04_07, %%B00_03, %%B12_15
+ vpaddd %%B08_11, %%B04_07, %%B12_15
+ vpaddd %%B12_15, %%B08_11, %%B12_15
+ vpshufb %%B00_03, %%SHUF_MASK
+ vpshufb %%B04_07, %%SHUF_MASK
+ vpshufb %%B08_11, %%SHUF_MASK
+ vpshufb %%B12_15, %%SHUF_MASK
+%%_next_16_ok:
+ vshufi64x2 %%CTR, %%B12_15, %%B12_15, 1111_1111b
+ add BYTE(%%CTR_CHECK), 16
+
+ ;; === load 16 blocks of data
+ VX512LDR %%T0, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*0)]
+ VX512LDR %%T1, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*1)]
+ VX512LDR %%T2, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*2)]
+ VX512LDR %%T3, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*3)]
+
+ ;; move to AES encryption rounds
+%assign i 0
+ vbroadcastf64x2 %%T4, [%%KP + (16*i)]
+ vpxorq %%B00_03, %%B00_03, %%T4
+ vpxorq %%B04_07, %%B04_07, %%T4
+ vpxorq %%B08_11, %%B08_11, %%T4
+ vpxorq %%B12_15, %%B12_15, %%T4
+%assign i (i + 1)
+
+%rep NROUNDS
+ vbroadcastf64x2 %%T4, [%%KP + (16*i)]
+ vaesenc %%B00_03, %%B00_03, %%T4
+ vaesenc %%B04_07, %%B04_07, %%T4
+ vaesenc %%B08_11, %%B08_11, %%T4
+ vaesenc %%B12_15, %%B12_15, %%T4
+%assign i (i + 1)
+%endrep
+
+ vbroadcastf64x2 %%T4, [%%KP + (16*i)]
+ vaesenclast %%B00_03, %%B00_03, %%T4
+ vaesenclast %%B04_07, %%B04_07, %%T4
+ vaesenclast %%B08_11, %%B08_11, %%T4
+ vaesenclast %%B12_15, %%B12_15, %%T4
+
+ ;; xor against text
+ vpxorq %%B00_03, %%B00_03, %%T0
+ vpxorq %%B04_07, %%B04_07, %%T1
+ vpxorq %%B08_11, %%B08_11, %%T2
+ vpxorq %%B12_15, %%B12_15, %%T3
+
+ ;; store
+ VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*0)], %%B00_03
+ VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*1)], %%B04_07
+ VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*2)], %%B08_11
+ VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*3)], %%B12_15
+
+%ifidn %%ENC_DEC, DEC
+ ;; decryption - cipher text needs to go to GHASH phase
+ vpshufb %%B00_03, %%T0, %%SHUF_MASK
+ vpshufb %%B04_07, %%T1, %%SHUF_MASK
+ vpshufb %%B08_11, %%T2, %%SHUF_MASK
+ vpshufb %%B12_15, %%T3, %%SHUF_MASK
+%else
+ ;; encryption
+ vpshufb %%B00_03, %%B00_03, %%SHUF_MASK
+ vpshufb %%B04_07, %%B04_07, %%SHUF_MASK
+ vpshufb %%B08_11, %%B08_11, %%SHUF_MASK
+ vpshufb %%B12_15, %%B12_15, %%SHUF_MASK
+%endif
+
+%ifnidn %%GHASH, no_ghash
+ ;; === xor cipher block 0 with GHASH for the next GHASH round
+ vpxorq %%B00_03, %%B00_03, %%GHASH
+%endif
+
+ vmovdqa64 [rsp + stack_offset + (0 * 64)], %%B00_03
+ vmovdqa64 [rsp + stack_offset + (1 * 64)], %%B04_07
+ vmovdqa64 [rsp + stack_offset + (2 * 64)], %%B08_11
+ vmovdqa64 [rsp + stack_offset + (3 * 64)], %%B12_15
+%endmacro ;INITIAL_BLOCKS_16
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; Encrypt the initial N x 16 blocks
+;;; - A x 16 blocks are encrypted/decrypted first (pipeline depth)
+;;; - B x 16 blocks are encrypted/decrypted and previous A x 16 are ghashed
+;;; - A + B = N
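+;;; e.g. with the big_loop parameters (N = 48, depth = 32) the first 32
+;;; blocks are only AES-CTR processed (filling the pipeline and the stack
+;;; frame), then blocks 33-48 are ciphered while blocks 1-16 are GHASHed;
+;;; the outstanding GHASH work is completed later by
+;;; GHASH_ENCRYPT_Nx16_PARALLEL / GHASH_LAST_Nx16.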
+%macro INITIAL_BLOCKS_Nx16 39
+%define %%IN %1 ; [in] input buffer
+%define %%OUT %2 ; [in] output buffer
+%define %%KP %3 ; [in] pointer to expanded keys
+%define %%DATA_OFFSET %4 ; [in/out] data offset
+%define %%GHASH %5 ; [in] ZMM with AAD (low 128 bits)
+%define %%CTR %6 ; [in/out] ZMM with CTR: in - LE & 128b; out - BE & 4x128b
+%define %%CTR_CHECK %7 ; [in/out] GPR with counter overflow check
+%define %%T0            %8      ; [clobbered] temporary ZMM register
+%define %%T1            %9      ; [clobbered] temporary ZMM register
+%define %%T2            %10     ; [clobbered] temporary ZMM register
+%define %%T3            %11     ; [clobbered] temporary ZMM register
+%define %%T4            %12     ; [clobbered] temporary ZMM register
+%define %%T5            %13     ; [clobbered] temporary ZMM register
+%define %%T6            %14     ; [clobbered] temporary ZMM register
+%define %%T7            %15     ; [clobbered] temporary ZMM register
+%define %%T8            %16     ; [clobbered] temporary ZMM register
+%define %%T9            %17     ; [clobbered] temporary ZMM register
+%define %%T10           %18     ; [clobbered] temporary ZMM register
+%define %%T11           %19     ; [clobbered] temporary ZMM register
+%define %%T12           %20     ; [clobbered] temporary ZMM register
+%define %%T13           %21     ; [clobbered] temporary ZMM register
+%define %%T14           %22     ; [clobbered] temporary ZMM register
+%define %%T15           %23     ; [clobbered] temporary ZMM register
+%define %%T16           %24     ; [clobbered] temporary ZMM register
+%define %%T17           %25     ; [clobbered] temporary ZMM register
+%define %%T18           %26     ; [clobbered] temporary ZMM register
+%define %%T19           %27     ; [clobbered] temporary ZMM register
+%define %%T20           %28     ; [clobbered] temporary ZMM register
+%define %%T21           %29     ; [clobbered] temporary ZMM register
+%define %%T22           %30     ; [clobbered] temporary ZMM register
+%define %%GH %31 ; [out] ZMM ghash sum (high)
+%define %%GL %32 ; [out] ZMM ghash sum (low)
+%define %%GM %33 ; [out] ZMM ghash sum (middle)
+%define %%ADDBE_4x4 %34 ; [in] ZMM 4x128bits with value 4 (big endian)
+%define %%ADDBE_1234 %35 ; [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian)
+%define %%SHUF_MASK %36 ; [in] ZMM with BE/LE shuffle mask
+%define %%ENC_DEC %37 ; [in] ENC (encrypt) or DEC (decrypt) selector
+%define %%NBLOCKS %38 ; [in] number of blocks: multiple of 16
+%define %%DEPTH_BLK     %39     ; [in] pipeline depth, number of blocks (multiple of 16)
+
+%assign aesout_offset (STACK_LOCAL_OFFSET + (0 * 16))
+%assign ghashin_offset (STACK_LOCAL_OFFSET + (0 * 16))
+%assign hkey_offset HashKey_ %+ %%NBLOCKS
+%assign data_in_out_offset 0
+
+ ;; set up CTR_CHECK
+ vmovd DWORD(%%CTR_CHECK), XWORD(%%CTR)
+ and DWORD(%%CTR_CHECK), 255
+
+ ;; in LE format after init, convert to BE
+ vshufi64x2 %%CTR, %%CTR, %%CTR, 0
+ vpshufb %%CTR, %%CTR, %%SHUF_MASK
+
+ ;; ==== AES lead in
+
+ ;; first 16 blocks - just cipher
+ INITIAL_BLOCKS_16 %%IN, %%OUT, %%KP, %%DATA_OFFSET, \
+ %%GHASH, %%CTR, %%CTR_CHECK, %%ADDBE_4x4, %%ADDBE_1234, \
+ %%T0, %%T1, %%T2, %%T3, %%T4, \
+ %%T5, %%T6, %%T7, %%T8, \
+ %%SHUF_MASK, %%ENC_DEC, aesout_offset, data_in_out_offset
+
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+
+%if (%%DEPTH_BLK > 16)
+%rep ((%%DEPTH_BLK - 16) / 16)
+ INITIAL_BLOCKS_16 %%IN, %%OUT, %%KP, %%DATA_OFFSET, \
+ no_ghash, %%CTR, %%CTR_CHECK, %%ADDBE_4x4, %%ADDBE_1234, \
+ %%T0, %%T1, %%T2, %%T3, %%T4, \
+ %%T5, %%T6, %%T7, %%T8, \
+ %%SHUF_MASK, %%ENC_DEC, aesout_offset, data_in_out_offset
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+%endrep
+%endif
+
+ ;; ==== GHASH + AES follows
+
+ ;; first 16 blocks stitched
+ GHASH_16_ENCRYPT_16_PARALLEL %%KP, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR, %%CTR_CHECK, \
+ hkey_offset, aesout_offset, ghashin_offset, %%SHUF_MASK, \
+ %%T0, %%T1, %%T2, %%T3, \
+ %%T4, %%T5, %%T6, %%T7, \
+ %%T8, %%T9, %%T10, %%T11,\
+ %%T12, %%T13, %%T14, %%T15,\
+ %%T16, %%T17, %%T18, %%T19, \
+ %%T20, %%T21, %%T22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GL, %%GH, %%GM, \
+ first_time, %%ENC_DEC, data_in_out_offset, no_ghash_in
+
+%if ((%%NBLOCKS - %%DEPTH_BLK) > 16)
+%rep ((%%NBLOCKS - %%DEPTH_BLK - 16) / 16)
+%assign ghashin_offset (ghashin_offset + (16 * 16))
+%assign hkey_offset (hkey_offset + (16 * 16))
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+
+ ;; mid 16 blocks - stitched
+ GHASH_16_ENCRYPT_16_PARALLEL %%KP, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR, %%CTR_CHECK, \
+ hkey_offset, aesout_offset, ghashin_offset, %%SHUF_MASK, \
+ %%T0, %%T1, %%T2, %%T3, \
+ %%T4, %%T5, %%T6, %%T7, \
+ %%T8, %%T9, %%T10, %%T11,\
+ %%T12, %%T13, %%T14, %%T15,\
+ %%T16, %%T17, %%T18, %%T19, \
+ %%T20, %%T21, %%T22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GL, %%GH, %%GM, \
+ no_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in
+%endrep
+%endif
+ add %%DATA_OFFSET, (%%NBLOCKS * 16)
+
+%endmacro ;INITIAL_BLOCKS_Nx16
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; GHASH the last %%DEPTH_BLK blocks of cipher text (tail of the by Nx16 code)
+%macro GHASH_LAST_Nx16 23
+%define %%KP %1 ; [in] pointer to expanded keys
+%define %%GHASH %2 ; [out] ghash output
+%define %%T1 %3 ; [clobbered] temporary ZMM
+%define %%T2 %4 ; [clobbered] temporary ZMM
+%define %%T3 %5 ; [clobbered] temporary ZMM
+%define %%T4 %6 ; [clobbered] temporary ZMM
+%define %%T5 %7 ; [clobbered] temporary ZMM
+%define %%T6 %8 ; [clobbered] temporary ZMM
+%define %%T7 %9 ; [clobbered] temporary ZMM
+%define %%T8 %10 ; [clobbered] temporary ZMM
+%define %%T9 %11 ; [clobbered] temporary ZMM
+%define %%T10 %12 ; [clobbered] temporary ZMM
+%define %%T11 %13 ; [clobbered] temporary ZMM
+%define %%T12 %14 ; [clobbered] temporary ZMM
+%define %%T13 %15 ; [clobbered] temporary ZMM
+%define %%T14 %16 ; [clobbered] temporary ZMM
+%define %%T15 %17 ; [clobbered] temporary ZMM
+%define %%T16 %18 ; [clobbered] temporary ZMM
+%define %%GH            %19     ; [in/clobbered] ghash sum (high)
+%define %%GL            %20     ; [in/clobbered] ghash sum (low)
+%define %%GM            %21     ; [in/clobbered] ghash sum (medium)
+%define %%LOOP_BLK      %22     ; [in] number of blocks handled by the loop
+%define %%DEPTH_BLK     %23     ; [in] pipeline depth in blocks (ghash vs aes)
+
+%define %%T0H %%T1
+%define %%T0L %%T2
+%define %%T0M1 %%T3
+%define %%T0M2 %%T4
+
+%define %%T1H %%T5
+%define %%T1L %%T6
+%define %%T1M1 %%T7
+%define %%T1M2 %%T8
+
+%define %%T2H %%T9
+%define %%T2L %%T10
+%define %%T2M1 %%T11
+%define %%T2M2 %%T12
+
+%define %%BLK1 %%T13
+%define %%BLK2 %%T14
+
+%define %%HK1 %%T15
+%define %%HK2 %%T16
+
+%assign hashk HashKey_ %+ %%DEPTH_BLK
+%assign cipher_blk (STACK_LOCAL_OFFSET + ((%%LOOP_BLK - %%DEPTH_BLK) * 16))
+
+ ;; load cipher blocks and ghash keys
+ vmovdqa64 %%BLK1, [rsp + cipher_blk]
+ vmovdqa64 %%BLK2, [rsp + cipher_blk + 64]
+ vmovdqu64 %%HK1, [%%KP + hashk]
+ vmovdqu64 %%HK2, [%%KP + hashk + 64]
+ ;; ghash blocks 0-3
+ vpclmulqdq %%T0H, %%BLK1, %%HK1, 0x11 ; %%TH = a1*b1
+ vpclmulqdq %%T0L, %%BLK1, %%HK1, 0x00 ; %%TL = a0*b0
+ vpclmulqdq %%T0M1, %%BLK1, %%HK1, 0x01 ; %%TM1 = a1*b0
+ vpclmulqdq %%T0M2, %%BLK1, %%HK1, 0x10 ; %%TM2 = a0*b1
+ ;; ghash blocks 4-7
+ vpclmulqdq %%T1H, %%BLK2, %%HK2, 0x11 ; %%TTH = a1*b1
+ vpclmulqdq %%T1L, %%BLK2, %%HK2, 0x00 ; %%TTL = a0*b0
+ vpclmulqdq %%T1M1, %%BLK2, %%HK2, 0x01 ; %%TTM1 = a1*b0
+ vpclmulqdq %%T1M2, %%BLK2, %%HK2, 0x10 ; %%TTM2 = a0*b1
+ vpternlogq %%T0H, %%T1H, %%GH, 0x96 ; T0H = T0H + T1H + GH
+ vpternlogq %%T0L, %%T1L, %%GL, 0x96 ; T0L = T0L + T1L + GL
+ vpternlogq %%T0M1, %%T1M1, %%GM, 0x96 ; T0M1 = T0M1 + T1M1 + GM
+ vpxorq %%T0M2, %%T0M2, %%T1M2 ; T0M2 = T0M2 + T1M2
+
+%rep ((%%DEPTH_BLK - 8) / 8)
+%assign hashk (hashk + 128)
+%assign cipher_blk (cipher_blk + 128)
+
+ ;; remaining blocks
+ ;; load next 8 cipher blocks and corresponding ghash keys
+ vmovdqa64 %%BLK1, [rsp + cipher_blk]
+ vmovdqa64 %%BLK2, [rsp + cipher_blk + 64]
+ vmovdqu64 %%HK1, [%%KP + hashk]
+ vmovdqu64 %%HK2, [%%KP + hashk + 64]
+ ;; ghash blocks 0-3
+ vpclmulqdq %%T1H, %%BLK1, %%HK1, 0x11 ; %%TH = a1*b1
+ vpclmulqdq %%T1L, %%BLK1, %%HK1, 0x00 ; %%TL = a0*b0
+ vpclmulqdq %%T1M1, %%BLK1, %%HK1, 0x01 ; %%TM1 = a1*b0
+ vpclmulqdq %%T1M2, %%BLK1, %%HK1, 0x10 ; %%TM2 = a0*b1
+ ;; ghash blocks 4-7
+ vpclmulqdq %%T2H, %%BLK2, %%HK2, 0x11 ; %%TTH = a1*b1
+ vpclmulqdq %%T2L, %%BLK2, %%HK2, 0x00 ; %%TTL = a0*b0
+ vpclmulqdq %%T2M1, %%BLK2, %%HK2, 0x01 ; %%TTM1 = a1*b0
+ vpclmulqdq %%T2M2, %%BLK2, %%HK2, 0x10 ; %%TTM2 = a0*b1
+ ;; update sums
+ vpternlogq %%T0H, %%T1H, %%T2H, 0x96 ; TH = T0H + T1H + T2H
+ vpternlogq %%T0L, %%T1L, %%T2L, 0x96 ; TL = T0L + T1L + T2L
+        vpternlogq      %%T0M1, %%T1M1, %%T2M1, 0x96    ; TM1 = T0M1 + T1M1 + T2M1
+        vpternlogq      %%T0M2, %%T1M2, %%T2M2, 0x96    ; TM2 = T0M2 + T1M2 + T2M2
+%endrep
+
+ ;; integrate TM into TH and TL
+ vpxorq %%T0M1, %%T0M1, %%T0M2
+ vpsrldq %%T1M1, %%T0M1, 8
+ vpslldq %%T1M2, %%T0M1, 8
+ vpxorq %%T0H, %%T0H, %%T1M1
+ vpxorq %%T0L, %%T0L, %%T1M2
+
+ ;; add TH and TL 128-bit words horizontally
+ VHPXORI4x128 %%T0H, %%T2M1
+ VHPXORI4x128 %%T0L, %%T2M2
+
+ ;; reduction
+ vmovdqa64 %%HK1, [rel POLY2]
+ VCLMUL_REDUCE %%GHASH, %%HK1, %%T0H, %%T0L, %%T0M1, %%T0M2
+%endmacro
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; Encrypt & ghash multiples of 16 blocks
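+;;; Steady-state loop body: each pass ciphers %%NUM_BLOCKS new blocks while
+;;; GHASHing the %%NUM_BLOCKS ciphertext blocks stored on the stack roughly
+;;; %%DEPTH_BLK blocks earlier in the pipeline, performs exactly one
+;;; reduction per pass and feeds the reduced hash back into cipher block 0
+;;; of the new pass (via %%GHASH).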
+
+%macro GHASH_ENCRYPT_Nx16_PARALLEL 39
+%define %%IN %1 ; [in] input buffer
+%define %%OUT %2 ; [in] output buffer
+%define %%GDATA_KEY %3 ; [in] pointer to expanded keys
+%define %%DATA_OFFSET %4 ; [in/out] data offset
+%define %%CTR_BE %5 ; [in/out] ZMM last counter block
+%define %%SHFMSK %6 ; [in] ZMM with byte swap mask for pshufb
+%define %%ZT0           %7      ; [clobbered] temporary ZMM register
+%define %%ZT1           %8      ; [clobbered] temporary ZMM register
+%define %%ZT2           %9      ; [clobbered] temporary ZMM register
+%define %%ZT3           %10     ; [clobbered] temporary ZMM register
+%define %%ZT4           %11     ; [clobbered] temporary ZMM register
+%define %%ZT5           %12     ; [clobbered] temporary ZMM register
+%define %%ZT6           %13     ; [clobbered] temporary ZMM register
+%define %%ZT7           %14     ; [clobbered] temporary ZMM register
+%define %%ZT8           %15     ; [clobbered] temporary ZMM register
+%define %%ZT9           %16     ; [clobbered] temporary ZMM register
+%define %%ZT10          %17     ; [clobbered] temporary ZMM register
+%define %%ZT11          %18     ; [clobbered] temporary ZMM register
+%define %%ZT12          %19     ; [clobbered] temporary ZMM register
+%define %%ZT13          %20     ; [clobbered] temporary ZMM register
+%define %%ZT14          %21     ; [clobbered] temporary ZMM register
+%define %%ZT15          %22     ; [clobbered] temporary ZMM register
+%define %%ZT16          %23     ; [clobbered] temporary ZMM register
+%define %%ZT17          %24     ; [clobbered] temporary ZMM register
+%define %%ZT18          %25     ; [clobbered] temporary ZMM register
+%define %%ZT19          %26     ; [clobbered] temporary ZMM register
+%define %%ZT20          %27     ; [clobbered] temporary ZMM register
+%define %%ZT21          %28     ; [clobbered] temporary ZMM register
+%define %%ZT22          %29     ; [clobbered] temporary ZMM register
+%define %%GTH %30 ; [in/out] ZMM GHASH sum (high)
+%define %%GTL %31 ; [in/out] ZMM GHASH sum (low)
+%define %%GTM %32 ; [in/out] ZMM GHASH sum (medium)
+%define %%ADDBE_4x4 %33 ; [in] ZMM 4x128bits with value 4 (big endian)
+%define %%ADDBE_1234 %34 ; [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian)
+%define %%GHASH         %35     ; [clobbered] ZMM with intermediate GHASH value
+%define %%ENC_DEC %36 ; [in] ENC (encrypt) or DEC (decrypt) selector
+%define %%NUM_BLOCKS %37 ; [in] number of blocks to process in the loop
+%define %%DEPTH_BLK %38 ; [in] pipeline depth in blocks
+%define %%CTR_CHECK %39 ; [in/out] counter to check byte overflow
+
+%assign aesout_offset (STACK_LOCAL_OFFSET + (0 * 16))
+%assign ghashin_offset (STACK_LOCAL_OFFSET + ((%%NUM_BLOCKS - %%DEPTH_BLK) * 16))
+%assign hkey_offset HashKey_ %+ %%DEPTH_BLK
+%assign data_in_out_offset 0
+
+ ;; mid 16 blocks
+%if (%%DEPTH_BLK > 16)
+%rep ((%%DEPTH_BLK - 16) / 16)
+ GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR_BE, %%CTR_CHECK, \
+ hkey_offset, aesout_offset, ghashin_offset, %%SHFMSK, \
+ %%ZT0, %%ZT1, %%ZT2, %%ZT3, \
+ %%ZT4, %%ZT5, %%ZT6, %%ZT7, \
+ %%ZT8, %%ZT9, %%ZT10, %%ZT11,\
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15,\
+ %%ZT16, %%ZT17, %%ZT18, %%ZT19, \
+ %%ZT20, %%ZT21, %%ZT22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GTL, %%GTH, %%GTM, \
+ no_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in
+
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign ghashin_offset (ghashin_offset + (16 * 16))
+%assign hkey_offset (hkey_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+%endrep
+%endif
+
+ ;; 16 blocks with reduction
+ GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR_BE, %%CTR_CHECK, \
+ HashKey_16, aesout_offset, ghashin_offset, %%SHFMSK, \
+ %%ZT0, %%ZT1, %%ZT2, %%ZT3, \
+ %%ZT4, %%ZT5, %%ZT6, %%ZT7, \
+ %%ZT8, %%ZT9, %%ZT10, %%ZT11,\
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15,\
+ %%ZT16, %%ZT17, %%ZT18, %%ZT19, \
+ %%ZT20, %%ZT21, %%ZT22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GTL, %%GTH, %%GTM, \
+ final_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in
+
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+%assign ghashin_offset (STACK_LOCAL_OFFSET + (0 * 16))
+%assign hkey_offset HashKey_ %+ %%NUM_BLOCKS
+
+ ;; === xor cipher block 0 with GHASH (ZT4)
+ vmovdqa64 %%GHASH, %%ZT4
+
+ ;; start the pipeline again
+ GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR_BE, %%CTR_CHECK, \
+ hkey_offset, aesout_offset, ghashin_offset, %%SHFMSK, \
+ %%ZT0, %%ZT1, %%ZT2, %%ZT3, \
+ %%ZT4, %%ZT5, %%ZT6, %%ZT7, \
+ %%ZT8, %%ZT9, %%ZT10, %%ZT11,\
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15,\
+ %%ZT16, %%ZT17, %%ZT18, %%ZT19, \
+ %%ZT20, %%ZT21, %%ZT22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GTL, %%GTH, %%GTM, \
+ first_time, %%ENC_DEC, data_in_out_offset, %%GHASH
+
+%if ((%%NUM_BLOCKS - %%DEPTH_BLK) > 16)
+%rep ((%%NUM_BLOCKS - %%DEPTH_BLK - 16 ) / 16)
+
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+%assign ghashin_offset (ghashin_offset + (16 * 16))
+%assign hkey_offset (hkey_offset + (16 * 16))
+
+ GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR_BE, %%CTR_CHECK, \
+ hkey_offset, aesout_offset, ghashin_offset, %%SHFMSK, \
+ %%ZT0, %%ZT1, %%ZT2, %%ZT3, \
+ %%ZT4, %%ZT5, %%ZT6, %%ZT7, \
+ %%ZT8, %%ZT9, %%ZT10, %%ZT11,\
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15,\
+ %%ZT16, %%ZT17, %%ZT18, %%ZT19, \
+ %%ZT20, %%ZT21, %%ZT22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GTL, %%GTH, %%GTM, \
+ no_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in
+%endrep
+%endif
+
+ add %%DATA_OFFSET, (%%NUM_BLOCKS * 16)
+
+%endmacro ;GHASH_ENCRYPT_Nx16_PARALLEL
+;;; ===========================================================================
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE finishes encryption/decryption of the last partial block after GCM_UPDATE finishes.
+; Input: gcm_key_data * (GDATA_KEY), gcm_context_data * (GDATA_CTX) and whether encoding or decoding (ENC_DEC).
+; Output: Authentication Tag (AUTH_TAG) and Authentication Tag length (AUTH_TAG_LEN)
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
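+; For reference, the tag follows the GCM definition: S = GHASH over AAD,
+; ciphertext and the 128-bit len(A)||len(C) block, T = MSB_t(E(K, J0) xor S);
+; E(K, J0) (= E(K, Y0) below) is issued first so it overlaps the final GHASH.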
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 6
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%AUTH_TAG %3
+%define %%AUTH_TAG_LEN %4
+%define %%ENC_DEC %5
+%define %%INSTANCE_TYPE %6
+%define %%PLAIN_CYPH_LEN rax
+
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+ ;; Start AES as early as possible
+ vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0
+ ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0)
+
+%ifidn %%INSTANCE_TYPE, multi_call
+        ;; When the GCM function is called as a single function call rather
+        ;; than through the individual parts (init, update, finalize), the
+        ;; AadHash value is still held in xmm14, so the reload below is
+        ;; skipped and a write-to-read dependency on AadHash is avoided.
+ vmovdqu xmm14, [%%GDATA_CTX + AadHash]
+
+	;; GHASH the final partial block. In a single call the partial block
+	;; was already handled in the main GCM_ENC_DEC macro.
+ mov r12, [%%GDATA_CTX + PBlockLen]
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+%%_partial_done:
+
+%endif
+
+ mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
+
+ shl r12, 3 ; convert into number of bits
+ vmovd xmm15, r12d ; len(A) in xmm15
+
+        shl     %%PLAIN_CYPH_LEN, 3             ; len(C) in bits (*8)
+ vmovq xmm1, %%PLAIN_CYPH_LEN
+ vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ vpxor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ vpxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6
+        vpshufb  xmm14, [rel SHUF_MASK]         ; perform a 16-byte swap
+
+ vpxor xmm9, xmm9, xmm14
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+ cmp r11, 8
+ je %%_T_8
+
+ simd_store_avx_15 r10, xmm9, r11, r12, rax
+ jmp %%_return_T_done
+%%_T_8:
+ vmovq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ vmovq rax, xmm9
+ mov [r10], rax
+ vpsrldq xmm9, xmm9, 8
+ vmovd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+%%_T_16:
+ vmovdqu [r10], xmm9
+
+%%_return_T_done:
+
+%ifdef SAFE_DATA
+ ;; Clear sensitive data from context structure
+ vpxor xmm0, xmm0
+ vmovdqu [%%GDATA_CTX + AadHash], xmm0
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm0
+%endif
+%endmacro ; GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_precomp_128_vaes_avx512 /
+; aes_gcm_precomp_192_vaes_avx512 /
+; aes_gcm_precomp_256_vaes_avx512
+; (struct gcm_key_data *key_data)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(precomp,_)
+FN_NAME(precomp,_):
+ endbranch
+;; Parameter is passed through register
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_precomp
+%endif
+
+ FUNC_SAVE
+
+ vpxor xmm6, xmm6
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
+
+ vpshufb xmm6, [rel SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
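+        ;; Illustrative outline of the steps below: multiply H by x in GF(2^128)
+        ;; (a 1-bit left shift across the full 128-bit value), then conditionally
+        ;; XOR in POLY when the shifted-out top bit was set (the TWOONE compare
+        ;; builds that mask), leaving HashKey<<1 mod poly in xmm6.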
+ vmovdqa xmm2, xmm6
+ vpsllq xmm6, xmm6, 1
+ vpsrlq xmm2, xmm2, 63
+ vmovdqa xmm1, xmm2
+ vpslldq xmm2, xmm2, 8
+ vpsrldq xmm1, xmm1, 8
+ vpor xmm6, xmm6, xmm2
+ ;reduction
+ vpshufd xmm2, xmm1, 00100100b
+ vpcmpeqd xmm2, [rel TWOONE]
+ vpand xmm2, xmm2, [rel POLY]
+ vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+ FUNC_RESTORE
+exit_precomp:
+
+ ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_init_128_vaes_avx512 / aes_gcm_init_192_vaes_avx512 / aes_gcm_init_256_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
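+;; Illustrative multi_call usage from C (not part of this file); assumes
+;; key_data has already been populated with the expanded AES key and GHASH
+;; keys (e.g. via the library's key-expansion/precompute helpers):
+;;   struct gcm_key_data kd;          /* pre-expanded */
+;;   struct gcm_context_data ctx;
+;;   aes_gcm_init_256_vaes_avx512(&kd, &ctx, iv, aad, aad_len);
+;;   aes_gcm_enc_256_update_vaes_avx512(&kd, &ctx, out, in, len);
+;;   aes_gcm_enc_256_finalize_vaes_avx512(&kd, &ctx, tag, 16);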
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(init,_)
+FN_NAME(init,_):
+ endbranch
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_init
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_init
+
+ ;; Check IV != NULL
+ cmp arg3, 0
+ jz exit_init
+
+ ;; Check if aad_len == 0
+ cmp arg5, 0
+ jz skip_aad_check_init
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg4, 0
+ jz exit_init
+
+skip_aad_check_init:
+%endif
+ GCM_INIT arg1, arg2, arg3, arg4, arg5, r10, r11, r12, k1, xmm14, xmm2, \
+ zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10
+
+exit_init:
+
+ FUNC_RESTORE
+ ret
+%endif ; _nt
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_update_vaes_avx512 / aes_gcm_enc_192_update_vaes_avx512 /
+; aes_gcm_enc_256_update_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(enc,_update_)
+FN_NAME(enc,_update_):
+ endbranch
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_update_enc
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_update_enc
+
+ ;; Check if plaintext_len == 0
+ cmp arg5, 0
+ jz skip_in_out_check_update_enc
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_update_enc
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_update_enc
+
+skip_in_out_check_update_enc:
+%endif
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call
+
+exit_update_enc:
+ FUNC_RESTORE
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_update_vaes_avx512 / aes_gcm_dec_192_update_vaes_avx512 /
+; aes_gcm_dec_256_update_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(dec,_update_)
+FN_NAME(dec,_update_):
+ endbranch
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_update_dec
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_update_dec
+
+ ;; Check if plaintext_len == 0
+ cmp arg5, 0
+ jz skip_in_out_check_update_dec
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_update_dec
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_update_dec
+
+skip_in_out_check_update_dec:
+%endif
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call
+
+exit_update_dec:
+ FUNC_RESTORE
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_finalize_vaes_avx512 / aes_gcm_enc_192_finalize_vaes_avx512 /
+; aes_gcm_enc_256_finalize_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(enc,_finalize_)
+FN_NAME(enc,_finalize_):
+ endbranch
+
+;; All parameters are passed through registers
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_enc_fin
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_enc_fin
+
+ ;; Check auth_tag != NULL
+ cmp arg3, 0
+ jz exit_enc_fin
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg4, 0
+ jz exit_enc_fin
+
+ cmp arg4, 16
+ ja exit_enc_fin
+%endif
+
+ FUNC_SAVE
+ GCM_COMPLETE arg1, arg2, arg3, arg4, ENC, multi_call
+
+ FUNC_RESTORE
+
+exit_enc_fin:
+ ret
+%endif ; _nt
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_finalize_vaes_avx512 / aes_gcm_dec_192_finalize_vaes_avx512
+; aes_gcm_dec_256_finalize_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(dec,_finalize_)
+FN_NAME(dec,_finalize_):
+ endbranch
+
+;; All parameters are passed through registers
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_dec_fin
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_dec_fin
+
+ ;; Check auth_tag != NULL
+ cmp arg3, 0
+ jz exit_dec_fin
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg4, 0
+ jz exit_dec_fin
+
+ cmp arg4, 16
+ ja exit_dec_fin
+%endif
+
+ FUNC_SAVE
+ GCM_COMPLETE arg1, arg2, arg3, arg4, DEC, multi_call
+
+ FUNC_RESTORE
+
+exit_dec_fin:
+ ret
+%endif ; _nt
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_vaes_avx512 / aes_gcm_enc_192_vaes_avx512 / aes_gcm_enc_256_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
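+;; Illustrative single-shot call from C (not part of this file; key_data
+;; assumed pre-expanded as above):
+;;   aes_gcm_enc_256_vaes_avx512(&kd, &ctx, ct, pt, pt_len,
+;;                               iv, aad, aad_len, tag, 16);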
+global FN_NAME(enc,_)
+FN_NAME(enc,_):
+ endbranch
+
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_enc
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_enc
+
+ ;; Check IV != NULL
+ cmp arg6, 0
+ jz exit_enc
+
+ ;; Check auth_tag != NULL
+ cmp arg9, 0
+ jz exit_enc
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg10, 0
+ jz exit_enc
+
+ cmp arg10, 16
+ ja exit_enc
+
+ ;; Check if plaintext_len == 0
+ cmp arg5, 0
+ jz skip_in_out_check_enc
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_enc
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_enc
+
+skip_in_out_check_enc:
+ ;; Check if aad_len == 0
+ cmp arg8, 0
+ jz skip_aad_check_enc
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg7, 0
+ jz exit_enc
+
+skip_aad_check_enc:
+%endif
+ GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12, k1, xmm14, xmm2, \
+ zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call
+ GCM_COMPLETE arg1, arg2, arg9, arg10, ENC, single_call
+
+exit_enc:
+ FUNC_RESTORE
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_vaes_avx512 / aes_gcm_dec_192_vaes_avx512 / aes_gcm_dec_256_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(dec,_)
+FN_NAME(dec,_):
+ endbranch
+
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_dec
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_dec
+
+ ;; Check IV != NULL
+ cmp arg6, 0
+ jz exit_dec
+
+ ;; Check auth_tag != NULL
+ cmp arg9, 0
+ jz exit_dec
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg10, 0
+ jz exit_dec
+
+ cmp arg10, 16
+ ja exit_dec
+
+ ;; Check if plaintext_len == 0
+ cmp arg5, 0
+ jz skip_in_out_check_dec
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_dec
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_dec
+
+skip_in_out_check_dec:
+ ;; Check if aad_len == 0
+ cmp arg8, 0
+ jz skip_aad_check_dec
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg7, 0
+ jz exit_dec
+
+skip_aad_check_dec:
+%endif
+ GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12, k1, xmm14, xmm2, \
+ zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call
+ GCM_COMPLETE arg1, arg2, arg9, arg10, DEC, single_call
+
+exit_dec:
+ FUNC_RESTORE
+ ret
+
+%else ; Assembler doesn't understand these opcodes. Add an empty symbol for Windows.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_ %+ FN_NAME(avx512,_)
+no_ %+ FN_NAME(avx512,_) %+ :
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h b/src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h
new file mode 100644
index 000000000..8287198ae
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h
@@ -0,0 +1,476 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef AES_GCM_VECTORS_H_
+#define AES_GCM_VECTORS_H_
+
+#include <stdint.h>
+
+typedef enum gcm_key_size { BITS_128 = 16, BITS_256 = 32 } gcm_key_size;
+#define KBITS(K) (sizeof(K))
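+// Note: despite the name, KBITS() yields the key size in *bytes* (sizeof);
+// e.g. KBITS(K1) == 16, which matches BITS_128 above and is what the
+// vector() macro stores in gcm_vector.Klen.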
+
+// struct to hold pointers to the key, plaintext and ciphertext vectors
+typedef struct gcm_vector {
+ uint8_t* K; // AES Key
+	gcm_key_size Klen;	// length of key in bytes (see gcm_key_size)
+ uint8_t* IV; // initial value used by GCM
+ uint64_t IVlen; // length of IV in bytes
+ uint8_t* A; // additional authenticated data
+ uint64_t Alen; // length of AAD in bytes
+ uint8_t* P; // Plain text
+	uint64_t Plen;	  // length of plaintext in bytes
+ //outputs of encryption
+ uint8_t* C; // same length as PT
+ uint8_t* T; // Authentication tag
+	uint8_t Tlen;	  // tag length in bytes (tags of 0 to 128 bits are allowed)
+} gcm_vector;
+
+///////
+// 60-Byte Packet Encryption Using GCM-AES-128
+// http://www.ieee802.org/1/files/public/docs2011/bn-randall-test-vectors-0511-v1.pdf
+// K: AD7A2BD03EAC835A6F620FDCB506B345
+// IV: 12153524C0895E81B2C28465
+// AAD: D609B1F056637A0D46DF998D88E52E00
+// B2C2846512153524C0895E81
+// P: 08000F101112131415161718191A1B1C
+// 1D1E1F202122232425262728292A2B2C
+// 2D2E2F303132333435363738393A0002
+// C: 701AFA1CC039C0D765128A665DAB6924
+// 3899BF7318CCDC81C9931DA17FBE8EDD
+// 7D17CB8B4C26FC81E3284F2B7FBA713D
+// AT: 4F8D55E7D3F06FD5A13C0C29B9D5B880
+// H: 73A23D80121DE2D5A850253FCF43120E
+///////
+static uint8_t K1[] = {0xAD, 0x7A, 0x2B, 0xD0, 0x3E, 0xAC, 0x83, 0x5A, 0x6F, 0x62, 0x0F, 0xDC, 0xB5, 0x06, 0xB3, 0x45};
+static uint8_t P1[] = {
+ 0x08, 0x00, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C
+ , 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C
+ , 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x00, 0x02
+};
+static uint8_t IV1[] = {0x12, 0x15, 0x35, 0x24, 0xC0, 0x89, 0x5E, 0x81, 0xB2, 0xC2, 0x84, 0x65};
+static uint8_t A1[] = {
+ 0xD6, 0x09, 0xB1, 0xF0, 0x56, 0x63, 0x7A, 0x0D, 0x46, 0xDF, 0x99, 0x8D, 0x88, 0xE5, 0x2E, 0x00
+ , 0xB2, 0xC2, 0x84, 0x65, 0x12, 0x15, 0x35, 0x24, 0xC0, 0x89, 0x5E, 0x81
+};
+#define A1_len sizeof(A1)
+static uint8_t C1[] = {
+ 0x70, 0x1A, 0xFA, 0x1C, 0xC0, 0x39, 0xC0, 0xD7, 0x65, 0x12, 0x8A, 0x66, 0x5D, 0xAB, 0x69, 0x24
+ , 0x38, 0x99, 0xBF, 0x73, 0x18, 0xCC, 0xDC, 0x81, 0xC9, 0x93, 0x1D, 0xA1, 0x7F, 0xBE, 0x8E, 0xDD
+ , 0x7D, 0x17, 0xCB, 0x8B, 0x4C, 0x26, 0xFC, 0x81, 0xE3, 0x28, 0x4F, 0x2B, 0x7F, 0xBA, 0x71, 0x3D
+};
+static uint8_t T1[] = {
+ 0x4F, 0x8D, 0x55, 0xE7, 0xD3, 0xF0, 0x6F, 0xD5, 0xA1, 0x3C, 0x0C, 0x29, 0xB9, 0xD5, 0xB8, 0x80
+};
+
+
+///////
+// 54-Byte Packet Encryption Using GCM-AES-128
+// http://www.ieee802.org/1/files/public/docs2011/bn-randall-test-vectors-0511-v1.pdf
+// K: 071B113B0CA743FECCCF3D051F737382
+// IV: F0761E8DCD3D000176D457ED
+// AAD: E20106D7CD0DF0761E8DCD3D88E54C2A
+// 76D457ED
+// P: 08000F101112131415161718191A1B1C
+// 1D1E1F202122232425262728292A2B2C
+// 2D2E2F30313233340004
+// C: 13B4C72B389DC5018E72A171DD85A5D3
+// 752274D3A019FBCAED09A425CD9B2E1C
+// 9B72EEE7C9DE7D52B3F3
+// AT: D6A5284F4A6D3FE22A5D6C2B960494C3
+// H: E4E01725D724C1215C7309AD34539257
+///////
+static uint8_t K2[] = {0x07, 0x1B, 0x11, 0x3B, 0x0C, 0xA7, 0x43, 0xFE, 0xCC, 0xCF, 0x3D, 0x05, 0x1F, 0x73, 0x73, 0x82};
+static uint8_t P2[] = {
+ 0x08, 0x00, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C
+ , 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C
+ , 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x00, 0x04
+};
+static uint8_t IV2[] = {0xF0, 0x76, 0x1E, 0x8D, 0xCD, 0x3D, 0x00, 0x01, 0x76, 0xD4, 0x57, 0xED};
+//static uint8_t IV1p[] = {0, 0, 0, 1};
+static uint8_t A2[] = {
+ 0xE2, 0x01, 0x06, 0xD7, 0xCD, 0x0D, 0xF0, 0x76, 0x1E, 0x8D, 0xCD, 0x3D, 0x88, 0xE5, 0x4C, 0x2A
+ , 0x76, 0xD4, 0x57, 0xED
+};
+#define A2_len sizeof(A2)
+static uint8_t C2[] = {
+ 0x13, 0xB4, 0xC7, 0x2B, 0x38, 0x9D, 0xC5, 0x01, 0x8E, 0x72, 0xA1, 0x71, 0xDD, 0x85, 0xA5, 0xD3
+ , 0x75, 0x22, 0x74, 0xD3, 0xA0, 0x19, 0xFB, 0xCA, 0xED, 0x09, 0xA4, 0x25, 0xCD, 0x9B, 0x2E, 0x1C
+ , 0x9B, 0x72, 0xEE, 0xE7, 0xC9, 0xDE, 0x7D, 0x52, 0xB3, 0xF3
+};
+static uint8_t T2[] = {
+ 0xD6, 0xA5, 0x28, 0x4F, 0x4A, 0x6D, 0x3F, 0xE2, 0x2A, 0x5D, 0x6C, 0x2B, 0x96, 0x04, 0x94, 0xC3
+};
+
+
+///////
+// http://csrc.nist.gov/groups/STM/cavp/gcmtestvectors.zip gcmEncryptExtIV128.rsp
+// [Keylen = 128]
+// [IVlen = 96]
+// [PTlen = 128]
+// [AADlen = 128]
+// [Taglen = 128]
+// Count = 0
+// K: c939cc13397c1d37de6ae0e1cb7c423c
+// IV: b3d8cc017cbb89b39e0f67e2
+// P: c3b3c41f113a31b73d9a5cd432103069
+// AAD: 24825602bd12a984e0092d3e448eda5f
+// C: 93fe7d9e9bfd10348a5606e5cafa7354
+// AT: 0032a1dc85f1c9786925a2e71d8272dd
+///////
+static uint8_t K3[] = {0xc9, 0x39, 0xcc, 0x13, 0x39, 0x7c, 0x1d, 0x37, 0xde, 0x6a, 0xe0, 0xe1, 0xcb, 0x7c, 0x42, 0x3c};
+static uint8_t IV3[] = {0xb3, 0xd8, 0xcc, 0x01, 0x7c, 0xbb, 0x89, 0xb3, 0x9e, 0x0f, 0x67, 0xe2};
+static uint8_t P3[] = {0xc3, 0xb3, 0xc4, 0x1f, 0x11, 0x3a, 0x31, 0xb7, 0x3d, 0x9a, 0x5c, 0xd4, 0x32, 0x10, 0x30, 0x69};
+static uint8_t A3[] = {0x24, 0x82, 0x56, 0x02, 0xbd, 0x12, 0xa9, 0x84, 0xe0, 0x09, 0x2d, 0x3e, 0x44, 0x8e, 0xda, 0x5f};
+#define A3_len sizeof(A3)
+static uint8_t C3[] = {0x93, 0xfe, 0x7d, 0x9e, 0x9b, 0xfd, 0x10, 0x34, 0x8a, 0x56, 0x06, 0xe5, 0xca, 0xfa, 0x73, 0x54};
+static uint8_t T3[] = {0x00, 0x32, 0xa1, 0xdc, 0x85, 0xf1, 0xc9, 0x78, 0x69, 0x25, 0xa2, 0xe7, 0x1d, 0x82, 0x72, 0xdd};
+
+///////
+// http://csrc.nist.gov/groups/STM/cavp/gcmtestvectors.zip gcmEncryptExtIV128.rsp
+// [Keylen = 128]
+// [IVlen = 96]
+// [PTlen = 256]
+// [AADlen = 128]
+// [Taglen = 128]
+// Count = 0
+// K = 298efa1ccf29cf62ae6824bfc19557fc
+// IV = 6f58a93fe1d207fae4ed2f6d
+// P = cc38bccd6bc536ad919b1395f5d63801f99f8068d65ca5ac63872daf16b93901
+// AAD = 021fafd238463973ffe80256e5b1c6b1
+// C = dfce4e9cd291103d7fe4e63351d9e79d3dfd391e3267104658212da96521b7db
+// T = 542465ef599316f73a7a560509a2d9f2
+///////
+static uint8_t K4[] = {0x29, 0x8e, 0xfa, 0x1c, 0xcf, 0x29, 0xcf, 0x62, 0xae, 0x68, 0x24, 0xbf, 0xc1, 0x95, 0x57, 0xfc};
+static uint8_t IV4[] = {0x6f, 0x58, 0xa9, 0x3f, 0xe1, 0xd2, 0x07, 0xfa, 0xe4, 0xed, 0x2f, 0x6d};
+static uint8_t P4[] = {0xcc, 0x38, 0xbc, 0xcd, 0x6b, 0xc5, 0x36, 0xad, 0x91, 0x9b, 0x13, 0x95, 0xf5, 0xd6, 0x38, 0x01, 0xf9, 0x9f, 0x80, 0x68, 0xd6, 0x5c, 0xa5, 0xac, 0x63, 0x87, 0x2d, 0xaf, 0x16, 0xb9, 0x39, 0x01};
+static uint8_t A4[] = {0x02, 0x1f, 0xaf, 0xd2, 0x38, 0x46, 0x39, 0x73, 0xff, 0xe8, 0x02, 0x56, 0xe5, 0xb1, 0xc6, 0xb1};
+#define A4_len sizeof(A4)
+static uint8_t C4[] = {0xdf, 0xce, 0x4e, 0x9c, 0xd2, 0x91, 0x10, 0x3d, 0x7f, 0xe4, 0xe6, 0x33, 0x51, 0xd9, 0xe7, 0x9d, 0x3d, 0xfd, 0x39, 0x1e, 0x32, 0x67, 0x10, 0x46, 0x58, 0x21, 0x2d, 0xa9, 0x65, 0x21, 0xb7, 0xdb};
+static uint8_t T4[] = {0x54, 0x24, 0x65, 0xef, 0x59, 0x93, 0x16, 0xf7, 0x3a, 0x7a, 0x56, 0x05, 0x09, 0xa2, 0xd9, 0xf2};
+
+///////
+// http://csrc.nist.gov/groups/STM/cavp/gcmtestvectors.zip gcmEncryptExtIV128.rsp
+// [Keylen = 128]
+// [IVlen = 96]
+// [PTlen = 256]
+// [AADlen = 128]
+// [Taglen = 128]
+// Count = 0
+// K = 298efa1ccf29cf62ae6824bfc19557fc
+// IV = 6f58a93fe1d207fae4ed2f6d
+// P = cc38bccd6bc536ad919b1395f5d63801f99f8068d65ca5ac63872daf16b93901
+// AAD = 021fafd238463973ffe80256e5b1c6b1
+// C = dfce4e9cd291103d7fe4e63351d9e79d3dfd391e3267104658212da96521b7db
+// T = 542465ef599316f73a7a560509a2d9f2
+///////
+static uint8_t K5[] = {0x29, 0x8e, 0xfa, 0x1c, 0xcf, 0x29, 0xcf, 0x62, 0xae, 0x68, 0x24, 0xbf, 0xc1, 0x95, 0x57, 0xfc};
+static uint8_t IV5[] = {0x6f, 0x58, 0xa9, 0x3f, 0xe1, 0xd2, 0x07, 0xfa, 0xe4, 0xed, 0x2f, 0x6d};
+static uint8_t P5[] = {0xcc, 0x38, 0xbc, 0xcd, 0x6b, 0xc5, 0x36, 0xad, 0x91, 0x9b, 0x13, 0x95, 0xf5, 0xd6, 0x38, 0x01, 0xf9, 0x9f, 0x80, 0x68, 0xd6, 0x5c, 0xa5, 0xac, 0x63, 0x87, 0x2d, 0xaf, 0x16, 0xb9, 0x39, 0x01};
+static uint8_t A5[] = {0x02, 0x1f, 0xaf, 0xd2, 0x38, 0x46, 0x39, 0x73, 0xff, 0xe8, 0x02, 0x56, 0xe5, 0xb1, 0xc6, 0xb1};
+#define A5_len sizeof(A5)
+static uint8_t C5[] = {0xdf, 0xce, 0x4e, 0x9c, 0xd2, 0x91, 0x10, 0x3d, 0x7f, 0xe4, 0xe6, 0x33, 0x51, 0xd9, 0xe7, 0x9d, 0x3d, 0xfd, 0x39, 0x1e, 0x32, 0x67, 0x10, 0x46, 0x58, 0x21, 0x2d, 0xa9, 0x65, 0x21, 0xb7, 0xdb};
+static uint8_t T5[] = {0x54, 0x24, 0x65, 0xef, 0x59, 0x93, 0x16, 0xf7, 0x3a, 0x7a, 0x56, 0x05, 0x09, 0xa2, 0xd9, 0xf2};
+
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 2
+// K: 00000000000000000000000000000000
+// P: 00000000000000000000000000000000
+// IV: 000000000000000000000000
+// C: 0388dace60b6a392f328c2b971b2fe78
+// T: ab6e47d42cec13bdf53a67b21257bddf
+// H: 66e94bd4ef8a2c3b884cfa59ca342b2e
+///////
+static uint8_t K6[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+static uint8_t P6[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+static uint8_t IV6[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+static uint8_t A6[] = {0};
+#define A6_len 0
+static uint8_t C6[] = {0x03, 0x88, 0xda, 0xce, 0x60, 0xb6, 0xa3, 0x92, 0xf3, 0x28, 0xc2, 0xb9, 0x71, 0xb2, 0xfe, 0x78};
+static uint8_t T6[] = {0xab, 0x6e, 0x47, 0xd4, 0x2c, 0xec, 0x13, 0xbd, 0xf5, 0x3a, 0x67, 0xb2, 0x12, 0x57, 0xbd, 0xdf};
+
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 3
+// K: feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b391aafd255
+// IV: cafebabefacedbaddecaf888
+// H: b83b533708bf535d0aa6e52980d53b78
+// C: 42831ec2217774244b7221b784d0d49c
+// e3aa212f2c02a4e035c17e2329aca12e
+// 21d514b25466931c7d8f6a5aac84aa05
+// 1ba30b396a0aac973d58e091473f5985
+// T: 4d5c2af327cd64a62cf35abd2ba6fab4
+///////
+static uint8_t K7[] = {0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08};
+static uint8_t P7[] = {0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a
+ , 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72
+ , 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25
+ , 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55};
+static uint8_t IV7[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88};
+static uint8_t A7[] = {0};
+#define A7_len 0
+static uint8_t C7[] = {0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24, 0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c
+ , 0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0, 0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e
+ , 0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c, 0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05
+ , 0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97, 0x3d, 0x58, 0xe0, 0x91, 0x47, 0x3f, 0x59, 0x85};
+static uint8_t T7[] = {0x4d, 0x5c, 0x2a, 0xf3, 0x27, 0xcd, 0x64, 0xa6, 0x2c, 0xf3, 0x5a, 0xbd, 0x2b, 0xa6, 0xfa, 0xb4};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 4
+// K: feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b39
+// A: feedfacedeadbeeffeedfacedeadbeef
+// abaddad2
+// IV: cafebabefacedbaddecaf888
+// H: b83b533708bf535d0aa6e52980d53b78
+// C: 42831ec2217774244b7221b784d0d49c
+// e3aa212f2c02a4e035c17e2329aca12e
+// 21d514b25466931c7d8f6a5aac84aa05
+// 1ba30b396a0aac973d58e091
+// T: 5bc94fbc3221a5db94fae95ae7121a47
+///////
+static uint8_t K8[] = {0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08};
+static uint8_t P8[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a
+ , 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72
+ , 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25
+ , 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39};
+static uint8_t A8[] = {0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef
+ , 0xab, 0xad, 0xda, 0xd2};
+#define A8_len sizeof(A8)
+static uint8_t IV8[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88};
+static uint8_t C8[] = {0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24, 0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c
+ , 0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0, 0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e
+ , 0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c, 0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05
+ , 0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97, 0x3d, 0x58, 0xe0, 0x91, 0x47, 0x3f, 0x59, 0x85};
+static uint8_t T8[] = {0x5b, 0xc9, 0x4f, 0xbc, 0x32, 0x21, 0xa5, 0xdb, 0x94, 0xfa, 0xe9, 0x5a, 0xe7, 0x12, 0x1a, 0x47};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 14
+// K: 00000000000000000000000000000000
+// 00000000000000000000000000000000
+// P: 00000000000000000000000000000000
+// A:
+// IV: 000000000000000000000000
+// H: dc95c078a2408989ad48a21492842087
+// C: cea7403d4d606b6e074ec5d3baf39d18
+// T: d0d1c8a799996bf0265b98b5d48ab919
+///////
+static uint8_t K9[] = {
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
+static uint8_t P9[] = {
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+};
+static uint8_t A9[] = {0};
+#define A9_len 0
+static uint8_t IV9[] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
+static uint8_t C9[] = {
+ 0xce, 0xa7, 0x40, 0x3d, 0x4d, 0x60, 0x6b, 0x6e, 0x07, 0x4e, 0xc5, 0xd3, 0xba, 0xf3, 0x9d, 0x18
+};
+static uint8_t T9[] = {0xd0, 0xd1, 0xc8, 0xa7, 0x99, 0x99, 0x6b, 0xf0, 0x26, 0x5b, 0x98, 0xb5, 0xd4, 0x8a, 0xb9, 0x19};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 15
+// K: feffe9928665731c6d6a8f9467308308
+// feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b391aafd255
+// A:
+// IV: cafebabefacedbaddecaf888
+// H: acbef20579b4b8ebce889bac8732dad7
+// C: 522dc1f099567d07f47f37a32a84427d
+// 643a8cdcbfe5c0c97598a2bd2555d1aa
+// 8cb08e48590dbb3da7b08b1056828838
+// c5f61e6393ba7a0abcc9f662898015ad
+// T: b094dac5d93471bdec1a502270e3cc6c
+///////
+static uint8_t K10[] = {
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08,
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08};
+static uint8_t P10[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+ 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+ 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+ 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55
+};
+static uint8_t A10[] = {0};
+#define A10_len 0
+static uint8_t IV10[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88};
+static uint8_t C10[] = {
+ 0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07, 0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d,
+ 0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9, 0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa,
+ 0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d, 0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38,
+ 0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a, 0xbc, 0xc9, 0xf6, 0x62, 0x89, 0x80, 0x15, 0xad
+};
+static uint8_t T10[] = {
+ 0xb0, 0x94, 0xda, 0xc5, 0xd9, 0x34, 0x71, 0xbd, 0xec, 0x1a, 0x50, 0x22, 0x70, 0xe3, 0xcc, 0x6c};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 16
+// K: feffe9928665731c6d6a8f9467308308
+// feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b39
+// A: feedfacedeadbeeffeedfacedeadbeef
+// abaddad2
+// IV: cafebabefacedbaddecaf888
+// H: acbef20579b4b8ebce889bac8732dad7
+// C: 522dc1f099567d07f47f37a32a84427d
+// 643a8cdcbfe5c0c97598a2bd2555d1aa
+// 8cb08e48590dbb3da7b08b1056828838
+// c5f61e6393ba7a0abcc9f662
+// T: 76fc6ece0f4e1768cddf8853bb2d551b
+///////
+static uint8_t K11[] = {
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08,
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08};
+static uint8_t P11[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+ 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+ 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+ 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39
+};
+static uint8_t A11[] = {
+ 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+ 0xab, 0xad, 0xda, 0xd2};
+#define A11_len sizeof(A11)
+static uint8_t IV11[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88};
+static uint8_t C11[] = {
+ 0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07, 0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d,
+ 0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9, 0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa,
+ 0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d, 0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38,
+ 0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a, 0xbc, 0xc9, 0xf6, 0x62
+};
+static uint8_t T11[] = {0x76, 0xfc, 0x6e, 0xce, 0x0f, 0x4e, 0x17, 0x68, 0xcd, 0xdf, 0x88, 0x53, 0xbb, 0x2d, 0x55, 0x1b};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 17 -- not supported (IV length less than 12 bytes)
+// K: feffe9928665731c6d6a8f9467308308
+// feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b39
+// A: feedfacedeadbeeffeedfacedeadbeef
+// abaddad2
+// IV: cafebabefacedbad
+// H: acbef20579b4b8ebce889bac8732dad7
+// C: c3762df1ca787d32ae47c13bf19844cb
+// af1ae14d0b976afac52ff7d79bba9de0
+// feb582d33934a4f0954cc2363bc73f78
+// 62ac430e64abe499f47c9b1f
+// T: 3a337dbf46a792c45e454913fe2ea8f2
+///////
+//static uint8_t K12[] = {
+// 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08,
+// 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08};
+//static uint8_t P12[] = {
+// 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+// 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+// 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+// 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39
+//};
+//static uint8_t A12[] = {
+// 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+// 0xab, 0xad, 0xda, 0xd2};
+//static uint8_t IV12[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad};
+//static uint8_t H12[] = {
+// 0xac, 0xbe, 0xf2, 0x05, 0x79, 0xb4, 0xb8, 0xeb, 0xce, 0x88, 0x9b, 0xac, 0x87, 0x32, 0xda, 0xd7};
+//static uint8_t C12[] = {
+// 0xc3, 0x76, 0x2d, 0xf1, 0xca, 0x78, 0x7d, 0x32, 0xae, 0x47, 0xc1, 0x3b, 0xf1, 0x98, 0x44, 0xcb,
+// 0xaf, 0x1a, 0xe1, 0x4d, 0x0b, 0x97, 0x6a, 0xfa, 0xc5, 0x2f, 0xf7, 0xd7, 0x9b, 0xba, 0x9d, 0xe0,
+// 0xfe, 0xb5, 0x82, 0xd3, 0x39, 0x34, 0xa4, 0xf0, 0x95, 0x4c, 0xc2, 0x36, 0x3b, 0xc7, 0x3f, 0x78,
+// 0x62, 0xac, 0x43, 0x0e, 0x64, 0xab, 0xe4, 0x99, 0xf4, 0x7c, 0x9b, 0x1f
+//};
+//static uint8_t T12[] = {
+// 0x3a, 0x33, 0x7d, 0xbf, 0x46, 0xa7, 0x92, 0xc4, 0x5e, 0x45, 0x49, 0x13, 0xfe, 0x2e, 0xa8, 0xf2};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 18 -- not supported (IV length greater than 12 bytes)
+// K: feffe9928665731c6d6a8f9467308308
+// feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b39
+// A: feedfacedeadbeeffeedfacedeadbeef
+// abaddad2
+// IV: 9313225df88406e555909c5aff5269aa
+// 6a7a9538534f7da1e4c303d2a318a728
+// c3c0c95156809539fcf0e2429a6b5254
+// 16aedbf5a0de6a57a637b39b
+// H: acbef20579b4b8ebce889bac8732dad7
+// C: 5a8def2f0c9e53f1f75d7853659e2a20
+// eeb2b22aafde6419a058ab4f6f746bf4
+// 0fc0c3b780f244452da3ebf1c5d82cde
+// a2418997200ef82e44ae7e3f
+// T: a44a8266ee1c8eb0c8b5d4cf5ae9f19a
+///////
+
+
+#define vector(N) {K##N, (KBITS(K##N)), IV##N, sizeof(IV##N), A##N, A##N##_len, P##N, sizeof(P##N), C##N, T##N, sizeof(T##N)}
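+// For reference, vector(1) expands to:
+//   {K1, (KBITS(K1)), IV1, sizeof(IV1), A1, A1_len, P1, sizeof(P1), C1, T1, sizeof(T1)}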
+
+gcm_vector const gcm_vectors[] = {
+ //field order {K, Klen, IV, IVlen, A, Alen, P, Plen, C, T, Tlen};
+ // original vector does not have a valid sub hash key
+ vector(1),
+ vector(2),
+ vector(3),
+ vector(4),
+ vector(5),
+ vector(6),
+ vector(7),
+ vector(8),
+ vector(9),
+ vector(10),
+ vector(11),
+	/* vector(12), -- IVs of less than 12 bytes are not supported */
+};
+
+#endif /* AES_GCM_VECTORS_H_ */
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm
new file mode 100644
index 000000000..ddae6a4e7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm
@@ -0,0 +1,328 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Routine to do AES key expansion
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+%macro key_expansion_128_sse 0
+	;; Assumes xmm3 contains all zeros at this point.
+ pshufd xmm2, xmm2, 11111111b
+ shufps xmm3, xmm1, 00010000b
+ pxor xmm1, xmm3
+ shufps xmm3, xmm1, 10001100b
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+%endmacro
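+;; Descriptive note on the macro above: xmm2 arrives as the aeskeygenassist
+;; result, and the pshufd broadcasts its top dword, i.e.
+;; RotWord(SubWord(w[3])) XOR Rcon. The two shufps/pxor pairs (with xmm3 = 0)
+;; build the running XORs of the previous round-key words, so xmm1 leaves the
+;; macro holding the next round key per the FIPS-197 AES-128 key schedule.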
+
+%macro key_expansion_128_avx 0
+	;; Assumes xmm3 contains all zeros at this point.
+ vpshufd xmm2, xmm2, 11111111b
+ vshufps xmm3, xmm3, xmm1, 00010000b
+ vpxor xmm1, xmm1, xmm3
+ vshufps xmm3, xmm3, xmm1, 10001100b
+ vpxor xmm1, xmm1, xmm3
+ vpxor xmm1, xmm1, xmm2
+%endmacro
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define KEY rdi
+%define EXP_ENC_KEYS rsi
+%define EXP_DEC_KEYS rdx
+%else
+%define KEY rcx
+%define EXP_ENC_KEYS rdx
+%define EXP_DEC_KEYS r8
+%endif
+
+
+; void aes_keyexp_128(UINT8 *key,
+; UINT8 *enc_exp_keys,
+; UINT8 *dec_exp_keys);
+;
+; arg 1: rcx: pointer to key
+; arg 2: rdx: pointer to expanded key array for encrypt
+; arg 3: r8: pointer to expanded key array for decrypt
+;
+mk_global aes_keyexp_128_sse, function
+aes_keyexp_128_sse:
+ endbranch
+ movdqu xmm1, [KEY] ; loading the AES key
+ movdqu [EXP_ENC_KEYS + 16*0], xmm1
+ movdqu [EXP_DEC_KEYS + 16*10], xmm1 ; Storing key in memory
+ pxor xmm3, xmm3
+
+ aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*1], xmm1
+ aesimc xmm4, xmm1
+ movdqu [EXP_DEC_KEYS + 16*9], xmm4
+
+ aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*2], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*8], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*3], xmm1
+ aesimc xmm4, xmm1
+ movdqu [EXP_DEC_KEYS + 16*7], xmm4
+
+ aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*4], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*6], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*5], xmm1
+ aesimc xmm4, xmm1
+ movdqu [EXP_DEC_KEYS + 16*5], xmm4
+
+ aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*6], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*4], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*7], xmm1
+ aesimc xmm4, xmm1
+ movdqu [EXP_DEC_KEYS + 16*3], xmm4
+
+ aeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*8], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*2], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*9], xmm1
+ aesimc xmm4, xmm1
+ movdqu [EXP_DEC_KEYS + 16*1], xmm4
+
+ aeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*10], xmm1
+ movdqu [EXP_DEC_KEYS + 16*0], xmm1
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+mk_global aes_keyexp_128_avx, function
+aes_keyexp_128_avx:
+ endbranch
+ vmovdqu xmm1, [KEY] ; loading the AES key
+ vmovdqu [EXP_ENC_KEYS + 16*0], xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*10], xmm1 ; Storing key in memory
+ vpxor xmm3, xmm3, xmm3
+
+ vaeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*1], xmm1
+ vaesimc xmm4, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*9], xmm4
+
+ vaeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*2], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*8], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*3], xmm1
+ vaesimc xmm4, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*7], xmm4
+
+ vaeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*4], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*6], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*5], xmm1
+ vaesimc xmm4, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*5], xmm4
+
+ vaeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*6], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*4], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*7], xmm1
+ vaesimc xmm4, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*3], xmm4
+
+ vaeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*8], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*2], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*9], xmm1
+ vaesimc xmm4, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*1], xmm4
+
+ vaeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*10], xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*0], xmm1
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; void aes_keyexp_128_enc_sse(UINT8 *key,
+; UINT8 *enc_exp_keys);
+;
+; arg 1: rcx: pointer to key
+; arg 2: rdx: pointer to expanded key array for encrypt
+;
+mk_global aes_keyexp_128_enc_sse, function
+aes_keyexp_128_enc_sse:
+ endbranch
+ movdqu xmm1, [KEY] ; loading the AES key
+ movdqu [EXP_ENC_KEYS + 16*0], xmm1
+ pxor xmm3, xmm3
+
+ aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*1], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*2], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*3], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*4], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*5], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*6], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*7], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*8], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*9], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*10], xmm1
+
+ ret
+
+mk_global aes_keyexp_128_enc_avx, function
+aes_keyexp_128_enc_avx:
+ endbranch
+ vmovdqu xmm1, [KEY] ; loading the AES key
+ vmovdqu [EXP_ENC_KEYS + 16*0], xmm1
+ vpxor xmm3, xmm3, xmm3
+
+ vaeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*1], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*2], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*3], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*4], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*5], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*6], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*7], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*8], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*9], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*10], xmm1
+
+ ret
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm
new file mode 100644
index 000000000..7cde5fb67
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm
@@ -0,0 +1,274 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define KEY rdi
+%define EXP_ENC_KEYS rsi
+%define EXP_DEC_KEYS rdx
+%else
+%define KEY rcx
+%define EXP_ENC_KEYS rdx
+%define EXP_DEC_KEYS r8
+%endif
+
+
+
+
+%macro key_expansion_1_192_sse 1
+	;; Assumes xmm3 contains all zeros at this point.
+ pshufd xmm2, xmm2, 11111111b
+ shufps xmm3, xmm1, 00010000b
+ pxor xmm1, xmm3
+ shufps xmm3, xmm1, 10001100b
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu [EXP_ENC_KEYS+%1], xmm1
+%endmacro
+
+; Calculate w10 and w11 using calculated w9 and known w4-w5
+%macro key_expansion_2_192_sse 1
+ movdqu xmm5, xmm4
+ pslldq xmm5, 4
+ shufps xmm6, xmm1, 11110000b
+ pxor xmm6, xmm5
+ pxor xmm4, xmm6
+ pshufd xmm7, xmm4, 00001110b
+ movdqu [EXP_ENC_KEYS+%1], xmm7
+%endmacro
+
+%macro key_dec_192_sse 1
+ movdqu xmm0, [EXP_ENC_KEYS + 16 * %1]
+ aesimc xmm1, xmm0
+ movdqu [EXP_DEC_KEYS + 16 * (12 - %1)], xmm1
+%endmacro
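+;; Descriptive note: the decryption schedule is built for the Equivalent
+;; Inverse Cipher -- each middle encryption round key is passed through
+;; aesimc (InvMixColumns) and stored in reverse order, while the first and
+;; last round keys are copied as-is.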
+
+
+
+
+
+%macro key_expansion_1_192_avx 1
+	;; Assumes xmm3 contains all zeros at this point.
+ vpshufd xmm2, xmm2, 11111111b
+ vshufps xmm3, xmm3, xmm1, 00010000b
+ vpxor xmm1, xmm1, xmm3
+ vshufps xmm3, xmm3, xmm1, 10001100b
+ vpxor xmm1, xmm1, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vmovdqu [EXP_ENC_KEYS+%1], xmm1
+%endmacro
+
+; Calculate w10 and w11 using calculated w9 and known w4-w5
+%macro key_expansion_2_192_avx 1
+ vmovdqa xmm5, xmm4
+ vpslldq xmm5, xmm5, 4
+ vshufps xmm6, xmm6, xmm1, 11110000b
+ vpxor xmm6, xmm6, xmm5
+ vpxor xmm4, xmm4, xmm6
+ vpshufd xmm7, xmm4, 00001110b
+ vmovdqu [EXP_ENC_KEYS+%1], xmm7
+%endmacro
+
+%macro key_dec_192_avx 1
+ vmovdqu xmm0, [EXP_ENC_KEYS + 16 * %1]
+ vaesimc xmm1, xmm0
+ vmovdqu [EXP_DEC_KEYS + 16 * (12 - %1)], xmm1
+%endmacro
+
+
+
+
+; void aes_keyexp_192(UINT8 *key,
+; UINT8 *enc_exp_keys,
+; UINT8 *dec_exp_keys);
+;
+; arg 1: rcx: pointer to key
+; arg 2: rdx: pointer to expanded key array for encrypt
+; arg 3: r8: pointer to expanded key array for decrypt
+;
+mk_global aes_keyexp_192_sse, function
+aes_keyexp_192_sse:
+ endbranch
+
+%ifnidn __OUTPUT_FORMAT__, elf64
+ sub rsp, 16*2 + 8
+ movdqu [rsp + 0*16], xmm6
+ movdqu [rsp + 1*16], xmm7
+%endif
+
+ movq xmm7, [KEY + 16] ; loading the AES key, 64 bits
+	movq	[EXP_ENC_KEYS + 16], xmm7	; store the upper 64 bits of the key with the expanded encryption keys
+ pshufd xmm4, xmm7, 01001111b
+ movdqu xmm1, [KEY] ; loading the AES key, 128 bits
+	movdqu	[EXP_ENC_KEYS], xmm1		; store the first 128 bits of the key with the expanded encryption keys
+ movdqu [EXP_DEC_KEYS + 16*0], xmm1
+ movdqu [EXP_DEC_KEYS + 16*12], xmm1
+
+ pxor xmm3, xmm3 ; Set xmm3 to be all zeros. Required for the key_expansion.
+	pxor xmm6, xmm6			; Set xmm6 to be all zeros. Required for the key_expansion.
+
+ aeskeygenassist xmm2, xmm4, 0x1 ; Complete round key 1 and generate round key 2
+ key_expansion_1_192_sse 24
+ key_expansion_2_192_sse 40
+
+ aeskeygenassist xmm2, xmm4, 0x2 ; Generate round key 3 and part of round key 4
+ key_expansion_1_192_sse 48
+ key_expansion_2_192_sse 64
+
+ aeskeygenassist xmm2, xmm4, 0x4 ; Complete round key 4 and generate round key 5
+ key_expansion_1_192_sse 72
+ key_expansion_2_192_sse 88
+
+ aeskeygenassist xmm2, xmm4, 0x8 ; Generate round key 6 and part of round key 7
+ key_expansion_1_192_sse 96
+ key_expansion_2_192_sse 112
+
+ aeskeygenassist xmm2, xmm4, 0x10 ; Complete round key 7 and generate round key 8
+ key_expansion_1_192_sse 120
+ key_expansion_2_192_sse 136
+
+ aeskeygenassist xmm2, xmm4, 0x20 ; Generate round key 9 and part of round key 10
+ key_expansion_1_192_sse 144
+ key_expansion_2_192_sse 160
+
+ aeskeygenassist xmm2, xmm4, 0x40 ; Complete round key 10 and generate round key 11
+ key_expansion_1_192_sse 168
+ key_expansion_2_192_sse 184
+
+ aeskeygenassist xmm2, xmm4, 0x80 ; Generate round key 12
+ key_expansion_1_192_sse 192
+
+;;; we have already saved the 12th key, which is pure input on the
+;;; ENC key path
+ movdqu xmm0, [EXP_ENC_KEYS + 16 * 12]
+ movdqu [EXP_DEC_KEYS + 16*0], xmm0
+;;; generate remaining decrypt keys
+ key_dec_192_sse 1
+ key_dec_192_sse 2
+ key_dec_192_sse 3
+ key_dec_192_sse 4
+ key_dec_192_sse 5
+ key_dec_192_sse 6
+ key_dec_192_sse 7
+ key_dec_192_sse 8
+ key_dec_192_sse 9
+ key_dec_192_sse 10
+ key_dec_192_sse 11
+
+%ifnidn __OUTPUT_FORMAT__, elf64
+ movdqu xmm6, [rsp + 0*16]
+ movdqu xmm7, [rsp + 1*16]
+ add rsp, 16*2 + 8
+%endif
+
+ ret
+
+
+
+mk_global aes_keyexp_192_avx, function
+aes_keyexp_192_avx:
+ endbranch
+
+%ifnidn __OUTPUT_FORMAT__, elf64
+ sub rsp, 16*2 + 8
+ vmovdqu [rsp + 0*16], xmm6
+ vmovdqu [rsp + 1*16], xmm7
+%endif
+
+ vmovq xmm7, [KEY + 16] ; loading the AES key, 64 bits
+	vmovq	[EXP_ENC_KEYS + 16], xmm7	; store the upper 64 bits of the key with the expanded encryption keys
+ vpshufd xmm4, xmm7, 01001111b
+ vmovdqu xmm1, [KEY] ; loading the AES key, 128 bits
+	vmovdqu	[EXP_ENC_KEYS], xmm1		; store the first 128 bits of the key with the expanded encryption keys
+ vmovdqu [EXP_DEC_KEYS + 16*0], xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*12], xmm1
+
+ vpxor xmm3, xmm3, xmm3
+ vpxor xmm6, xmm6, xmm6
+
+ vaeskeygenassist xmm2, xmm4, 0x1 ; Complete round key 1 and generate round key 2
+ key_expansion_1_192_avx 24
+ key_expansion_2_192_avx 40
+
+ vaeskeygenassist xmm2, xmm4, 0x2 ; Generate round key 3 and part of round key 4
+ key_expansion_1_192_avx 48
+ key_expansion_2_192_avx 64
+
+ vaeskeygenassist xmm2, xmm4, 0x4 ; Complete round key 4 and generate round key 5
+ key_expansion_1_192_avx 72
+ key_expansion_2_192_avx 88
+
+ vaeskeygenassist xmm2, xmm4, 0x8 ; Generate round key 6 and part of round key 7
+ key_expansion_1_192_avx 96
+ key_expansion_2_192_avx 112
+
+ vaeskeygenassist xmm2, xmm4, 0x10 ; Complete round key 7 and generate round key 8
+ key_expansion_1_192_avx 120
+ key_expansion_2_192_avx 136
+
+ vaeskeygenassist xmm2, xmm4, 0x20 ; Generate round key 9 and part of round key 10
+ key_expansion_1_192_avx 144
+ key_expansion_2_192_avx 160
+
+ vaeskeygenassist xmm2, xmm4, 0x40 ; Complete round key 10 and generate round key 11
+ key_expansion_1_192_avx 168
+ key_expansion_2_192_avx 184
+
+ vaeskeygenassist xmm2, xmm4, 0x80 ; Generate round key 12
+ key_expansion_1_192_avx 192
+
+;;; we have already saved the 12th key, which is pure input on the
+;;; ENC key path
+ vmovdqu xmm0, [EXP_ENC_KEYS + 16 * 12]
+ vmovdqu [EXP_DEC_KEYS + 16*0], xmm0
+;;; generate remaining decrypt keys
+ key_dec_192_avx 1
+ key_dec_192_avx 2
+ key_dec_192_avx 3
+ key_dec_192_avx 4
+ key_dec_192_avx 5
+ key_dec_192_avx 6
+ key_dec_192_avx 7
+ key_dec_192_avx 8
+ key_dec_192_avx 9
+ key_dec_192_avx 10
+ key_dec_192_avx 11
+
+%ifnidn __OUTPUT_FORMAT__, elf64
+ vmovdqu xmm6, [rsp + 0*16]
+ vmovdqu xmm7, [rsp + 1*16]
+ add rsp, 16*2 + 8
+%endif
+
+ ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm
new file mode 100644
index 000000000..9b3eb7688
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm
@@ -0,0 +1,286 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+; Routine to do AES key expansion
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+; Uses the f() function of the aeskeygenassist result
+%macro key_expansion_256_sse 0
+        ;; Assumes xmm3 contains all zeros at this point.
+ pshufd xmm2, xmm2, 11111111b
+ shufps xmm3, xmm1, 00010000b
+ pxor xmm1, xmm3
+ shufps xmm3, xmm1, 10001100b
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+%endmacro
+
+; Uses the SubWord function of the aeskeygenassist result
+%macro key_expansion_256_sse_2 0
+        ;; Assumes xmm3 contains all zeros at this point.
+ pshufd xmm2, xmm2, 10101010b
+ shufps xmm3, xmm4, 00010000b
+ pxor xmm4, xmm3
+ shufps xmm3, xmm4, 10001100b
+ pxor xmm4, xmm3
+ pxor xmm4, xmm2
+%endmacro
+
+; Uses the f() function of the aeskeygenassist result
+%macro key_expansion_256_avx 0
+        ;; Assumes xmm3 contains all zeros at this point.
+ vpshufd xmm2, xmm2, 11111111b
+ vshufps xmm3, xmm3, xmm1, 00010000b
+ vpxor xmm1, xmm1, xmm3
+ vshufps xmm3, xmm3, xmm1, 10001100b
+ vpxor xmm1, xmm1, xmm3
+ vpxor xmm1, xmm1, xmm2
+%endmacro
+
+; Uses the SubWord function of the aeskeygenassist result
+%macro key_expansion_256_avx_2 0
+        ;; Assumes xmm3 contains all zeros at this point.
+ vpshufd xmm2, xmm2, 10101010b
+ vshufps xmm3, xmm3, xmm4, 00010000b
+ vpxor xmm4, xmm4, xmm3
+ vshufps xmm3, xmm3, xmm4, 10001100b
+ vpxor xmm4, xmm4, xmm3
+ vpxor xmm4, xmm4, xmm2
+%endmacro
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define KEY rdi
+%define EXP_ENC_KEYS rsi
+%define EXP_DEC_KEYS rdx
+%else
+%define KEY rcx
+%define EXP_ENC_KEYS rdx
+%define EXP_DEC_KEYS r8
+%endif
+
+; void aes_keyexp_256(UINT8 *key,
+; UINT8 *enc_exp_keys,
+; UINT8 *dec_exp_keys);
+;
+; arg 1: KEY:          pointer to key            (rdi on elf64, rcx on win64)
+; arg 2: EXP_ENC_KEYS: pointer to expanded key array for encrypt (rsi / rdx)
+; arg 3: EXP_DEC_KEYS: pointer to expanded key array for decrypt (rdx / r8)
+;
+mk_global aes_keyexp_256_sse, function
+aes_keyexp_256_sse:
+ endbranch
+ movdqu xmm1, [KEY] ; loading the AES key
+ movdqu [EXP_ENC_KEYS + 16*0], xmm1
+ movdqu [EXP_DEC_KEYS + 16*14], xmm1 ; Storing key in memory
+
+ movdqu xmm4, [KEY+16] ; loading the AES key
+ movdqu [EXP_ENC_KEYS + 16*1], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*13], xmm0 ; Storing key in memory
+
+ pxor xmm3, xmm3 ; Required for the key_expansion.
+
+ aeskeygenassist xmm2, xmm4, 0x1 ; Generating round key 2
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*2], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*12], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 3
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*3], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*11], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x2 ; Generating round key 4
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*4], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*10], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 5
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*5], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*9], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x4 ; Generating round key 6
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*6], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*8], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 7
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*7], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*7], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x8 ; Generating round key 8
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*8], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*6], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 9
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*9], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*5], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x10 ; Generating round key 10
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*10], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*4], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 11
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*11], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*3], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x20 ; Generating round key 12
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*12], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*2], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 13
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*13], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*1], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x40 ; Generating round key 14
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*14], xmm1
+ movdqu [EXP_DEC_KEYS + 16*0], xmm1
+
+ ret
+
+
+mk_global aes_keyexp_256_avx, function
+aes_keyexp_256_avx:
+ endbranch
+ vmovdqu xmm1, [KEY] ; loading the AES key
+ vmovdqu [EXP_ENC_KEYS + 16*0], xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*14], xmm1 ; Storing key in memory
+
+ vmovdqu xmm4, [KEY+16] ; loading the AES key
+ vmovdqu [EXP_ENC_KEYS + 16*1], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*13], xmm0 ; Storing key in memory
+
+ vpxor xmm3, xmm3, xmm3 ; Required for the key_expansion.
+
+ vaeskeygenassist xmm2, xmm4, 0x1 ; Generating round key 2
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*2], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*12], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 3
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*3], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*11], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x2 ; Generating round key 4
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*4], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*10], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 5
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*5], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*9], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x4 ; Generating round key 6
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*6], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*8], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 7
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*7], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*7], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x8 ; Generating round key 8
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*8], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*6], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 9
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*9], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*5], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x10 ; Generating round key 10
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*10], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*4], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 11
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*11], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*3], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x20 ; Generating round key 12
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*12], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*2], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 13
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*13], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*1], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x40 ; Generating round key 14
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*14], xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*0], xmm1
+
+ ret
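From C, these schedules are consumed through the prototype in the comment block above; the 256-bit variant writes 15 round keys (16 * 15 = 240 bytes) into each output buffer. A hedged usage sketch follows, assuming aes_keyexp_256() is declared in aes_keyexp.h the same way the 128-bit tests further down use aes_keyexp_128().

#include <stdint.h>
#include <stdio.h>
#include <aes_keyexp.h>          /* assumed to declare aes_keyexp_256() */

int main(void)
{
    uint8_t key[32] = { 0 };                 /* example 256-bit key (all zero) */
    uint8_t enc_keys[16 * 15];               /* 15 round keys for encryption   */
    uint8_t dec_keys[16 * 15];               /* 15 round keys for decryption   */

    aes_keyexp_256(key, enc_keys, dec_keys);

    /* enc_keys[0..15] is the first half of the raw key; dec_keys[16*14..]
     * holds the same bytes, as stored by the assembly above. */
    printf("first enc round key byte: %02x\n", enc_keys[0]);
    return 0;
}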
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_multibinary.asm
new file mode 100644
index 000000000..045649a64
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_multibinary.asm
@@ -0,0 +1,68 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+[bits 64]
+
+%include "reg_sizes.asm"
+
+extern aes_keyexp_128_sse
+extern aes_keyexp_128_avx
+extern aes_keyexp_128_enc_sse
+extern aes_keyexp_128_enc_avx
+
+extern aes_keyexp_192_sse
+extern aes_keyexp_192_avx
+
+extern aes_keyexp_256_sse
+extern aes_keyexp_256_avx
+
+%include "multibinary.asm"
+
+
+;;;;
+; instantiate aes_keyexp_128 interfaces
+;;;;
+mbin_interface aes_keyexp_128
+mbin_dispatch_init aes_keyexp_128, aes_keyexp_128_sse, aes_keyexp_128_avx, aes_keyexp_128_avx
+
+mbin_interface aes_keyexp_128_enc
+mbin_dispatch_init aes_keyexp_128_enc, aes_keyexp_128_enc_sse, aes_keyexp_128_enc_avx, aes_keyexp_128_enc_avx
+
+mbin_interface aes_keyexp_192
+mbin_dispatch_init aes_keyexp_192, aes_keyexp_192_sse, aes_keyexp_192_avx, aes_keyexp_192_avx
+
+mbin_interface aes_keyexp_256
+mbin_dispatch_init aes_keyexp_256, aes_keyexp_256_sse, aes_keyexp_256_avx, aes_keyexp_256_avx
+
+section .text
+;;; func core, ver, snum
+slversion aes_keyexp_128, 00, 01, 02a1
+slversion aes_keyexp_192, 00, 01, 02a2
+slversion aes_keyexp_256, 00, 01, 02a3
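Here mbin_interface creates the public symbol and mbin_dispatch_init registers the baseline (SSE) and AVX bodies; the first call probes the CPU and rebinds the symbol, so callers simply call aes_keyexp_256() and never select a variant themselves. A rough C sketch of that dispatch idea follows; hedged: this illustrates the pattern only, it is not the macro expansion in multibinary.asm, and the stub bodies plus the __builtin_cpu_supports() check (a GCC/Clang builtin) stand in for the real assembly routines and CPUID logic.

#include <stdint.h>
#include <stdio.h>

/* Stand-in implementations; in ISA-L these would be the _sse/_avx routines
 * from keyexp_256.asm. */
static void keyexp_256_sse_stub(uint8_t *k, uint8_t *e, uint8_t *d)
{ (void)k; (void)e; (void)d; puts("sse body"); }
static void keyexp_256_avx_stub(uint8_t *k, uint8_t *e, uint8_t *d)
{ (void)k; (void)e; (void)d; puts("avx body"); }

typedef void (*keyexp_fn)(uint8_t *, uint8_t *, uint8_t *);

static void keyexp_resolve(uint8_t *k, uint8_t *e, uint8_t *d);
static keyexp_fn aes_keyexp_256_ptr = keyexp_resolve;   /* starts at the resolver */

static void keyexp_resolve(uint8_t *k, uint8_t *e, uint8_t *d)
{
    /* Pick the best implementation once, then forward the call. */
    aes_keyexp_256_ptr = __builtin_cpu_supports("avx")
        ? keyexp_256_avx_stub : keyexp_256_sse_stub;
    aes_keyexp_256_ptr(k, e, d);
}

int main(void)
{
    uint8_t key[32] = { 0 }, enc[16 * 15], dec[16 * 15];
    aes_keyexp_256_ptr(key, enc, dec);       /* dispatch happens transparently */
    return 0;
}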
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h b/src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h
new file mode 100644
index 000000000..80c6e1e87
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h
@@ -0,0 +1,302 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef AES_OSSL_HELPER_H_
+#define AES_OSSL_HELPER_H_
+
+#ifdef _MSC_VER
+# define inline __inline
+#endif
+
+#include <openssl/evp.h>
+
+static inline
+ int openssl_aes_128_cbc_dec(uint8_t * key, uint8_t * iv,
+ int len, uint8_t * cyphertext, uint8_t * plaintext)
+{
+ int outlen = 0, tmplen = 0;
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_128_cbc(), NULL, key, iv))
+ printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_128_cbc\n");
+ if (!EVP_CIPHER_CTX_set_padding(ctx, 0))
+ printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+ if (!EVP_DecryptUpdate(ctx, plaintext, &outlen, (uint8_t const *)cyphertext, len))
+ printf("\n ERROR!! EVP_DecryptUpdate - EVP_aes_128_cbc\n");
+ if (!EVP_DecryptFinal_ex(ctx, &plaintext[outlen], &tmplen))
+ printf("\n ERROR!! EVP_DecryptFinal_ex - EVP_aes_128_cbc %x, %x, %x\n", len,
+ outlen, tmplen);
+
+ EVP_CIPHER_CTX_free(ctx);
+ return tmplen;
+}
+
+static inline
+ int openssl_aes_128_cbc_enc(uint8_t * key, uint8_t * iv,
+ int len, uint8_t * plaintext, uint8_t * cyphertext)
+{
+ int outlen, tmplen;
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_cbc(), NULL, key, iv))
+ printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_128_cbc\n");
+ if (!EVP_CIPHER_CTX_set_padding(ctx, 0))
+ printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+ if (!EVP_EncryptUpdate
+ (ctx, cyphertext, &outlen, (const unsigned char *)plaintext, len))
+ printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_128_cbc\n");
+ if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen))
+ printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_128_cbc\n");
+
+ EVP_CIPHER_CTX_free(ctx);
+ return tmplen;
+}
+
+static inline
+ int openssl_aes_192_cbc_dec(uint8_t * key, uint8_t * iv,
+ int len, uint8_t * cyphertext, uint8_t * plaintext)
+{
+ int outlen = 0, tmplen = 0;
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_192_cbc(), NULL, key, iv))
+ printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_192_cbc\n");
+ if (!EVP_CIPHER_CTX_set_padding(ctx, 0))
+ printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+ if (!EVP_DecryptUpdate
+ (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len))
+ printf("\n ERROR!! EVP_DecryptUpdate - EVP_aes_192_cbc\n");
+ if (!EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen))
+ printf("\n ERROR!! EVP_DecryptFinal_ex - EVP_aes_192_cbc \n");
+
+ EVP_CIPHER_CTX_free(ctx);
+ return 0;
+}
+
+static inline
+ int openssl_aes_192_cbc_enc(uint8_t * key, uint8_t * iv,
+ int len, uint8_t * plaintext, uint8_t * cyphertext)
+{
+ int outlen, tmplen;
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_192_cbc(), NULL, key, iv))
+ printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_192_cbc\n");
+ if (!EVP_CIPHER_CTX_set_padding(ctx, 0))
+ printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+ if (!EVP_EncryptUpdate
+ (ctx, cyphertext, &outlen, (const unsigned char *)plaintext, len))
+ printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_192_cbc\n");
+ if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen))
+ printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_192_cbc\n");
+
+ EVP_CIPHER_CTX_free(ctx);
+ return 0;
+}
+
+static inline
+ int openssl_aes_256_cbc_dec(uint8_t * key, uint8_t * iv,
+ int len, uint8_t * cyphertext, uint8_t * plaintext)
+{
+ int outlen = 0, tmplen = 0;
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_cbc(), NULL, key, iv))
+ printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_256_cbc\n");
+ if (!EVP_CIPHER_CTX_set_padding(ctx, 0))
+ printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+ if (!EVP_DecryptUpdate
+ (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len))
+ printf("\n ERROR!! EVP_DecryptUpdate - EVP_aes_256_cbc\n");
+ if (!EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen))
+ printf("\n ERROR!! EVP_DecryptFinal_ex - EVP_aes_256_cbc %x,%x\n", outlen,
+ tmplen);
+
+ EVP_CIPHER_CTX_free(ctx);
+ return 0;
+}
+
+static inline
+ int openssl_aes_256_cbc_enc(uint8_t * key, uint8_t * iv,
+ int len, uint8_t * plaintext, uint8_t * cyphertext)
+{
+ int outlen, tmplen;
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_cbc(), NULL, key, iv))
+ printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_256_cbc\n");
+ if (!EVP_CIPHER_CTX_set_padding(ctx, 0))
+ printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+ if (!EVP_EncryptUpdate
+ (ctx, cyphertext, &outlen, (const unsigned char *)plaintext, len))
+ printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_256_cbc\n");
+ if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen))
+ printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_256_cbc\n");
+
+ EVP_CIPHER_CTX_free(ctx);
+ return 0;
+}
+
+static inline
+ int openssl_aes_gcm_dec(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad,
+ int aad_len, uint8_t * tag, int tag_len, uint8_t * cyphertext,
+ int len, uint8_t * plaintext)
+{
+ int outlen = 0, tmplen = len, ret;
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_128_gcm(), NULL, NULL, NULL))
+ printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_128_gcm\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - set tag\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n");
+ if (!EVP_DecryptInit_ex(ctx, NULL, NULL, key, iv))
+ printf("\n ERROR!! EVP_DecryptInit_ex - key init\n");
+ if (!EVP_DecryptUpdate(ctx, NULL, &outlen, aad, aad_len))
+ printf("\n ERROR!! EVP_DecryptUpdate - aad data setup\n");
+ if (!EVP_DecryptUpdate
+ (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len))
+		printf("\n ERROR!! EVP_DecryptUpdate - CT->PT\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - set tag\n");
+
+ ret = EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen);
+ if (0 < ret) {
+ tmplen += outlen;
+ } else {
+		// Authentication failed: mismatched key, AAD or tag
+ tmplen = -1;
+ }
+
+ EVP_CIPHER_CTX_free(ctx);
+ return tmplen;
+}
+
+static inline
+ int openssl_aes_gcm_enc(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad,
+ int aad_len, uint8_t * tag, int tag_len, uint8_t * plaintext,
+ int len, uint8_t * cyphertext)
+{
+ int outlen, tmplen;
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ //printf("ivl:%x addl:%x tagl:%x ptl:%x\n", iv_len, aad_len, tag_len, len);
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_gcm(), NULL, NULL, NULL))
+		printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_128_gcm\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n");
+ if (!EVP_EncryptInit_ex(ctx, NULL, NULL, key, iv))
+ printf("\n ERROR!! EVP_EncryptInit_ex - init\n");
+ if (!EVP_EncryptUpdate(ctx, NULL, &outlen, aad, aad_len))
+ printf("\n ERROR!! EVP_EncryptUpdate - aad insert\n");
+ if (!EVP_EncryptUpdate(ctx, cyphertext, &outlen, (const uint8_t *)plaintext, len))
+		printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_128_gcm\n");
+	if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen))
+		printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_128_gcm\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_GET_TAG, tag_len, tag))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - tag \n");
+
+ EVP_CIPHER_CTX_free(ctx);
+ return tmplen;
+}
+
+static inline
+ int openssl_aes_256_gcm_dec(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad,
+ int aad_len, uint8_t * tag, int tag_len, uint8_t * cyphertext,
+ int len, uint8_t * plaintext)
+{
+ int outlen = 0, tmplen = len, ret;
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_gcm(), NULL, NULL, NULL))
+		printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_256_gcm\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - set tag\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n");
+ if (!EVP_DecryptInit_ex(ctx, NULL, NULL, key, iv))
+ printf("\n ERROR!! EVP_DecryptInit_ex - key init\n");
+ if (!EVP_DecryptUpdate(ctx, NULL, &outlen, aad, aad_len))
+ printf("\n ERROR!! EVP_DecryptUpdate - aad data setup\n");
+ if (!EVP_DecryptUpdate
+ (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len))
+		printf("\n ERROR!! EVP_DecryptUpdate - CT->PT\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - set tag\n");
+ ret = EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen);
+ if (0 < ret) {
+ tmplen += outlen;
+ } else {
+		// Authentication failed: mismatched key, AAD or tag
+ tmplen = -1;
+ }
+
+ EVP_CIPHER_CTX_free(ctx);
+ return tmplen;
+}
+
+static inline
+ int openssl_aes_256_gcm_enc(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad,
+ int aad_len, uint8_t * tag, int tag_len, uint8_t * plaintext,
+ int len, uint8_t * cyphertext)
+{
+ int outlen, tmplen;
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_gcm(), NULL, NULL, NULL))
+		printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_256_gcm\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n");
+ if (!EVP_EncryptInit_ex(ctx, NULL, NULL, key, iv))
+ printf("\n ERROR!! EVP_EncryptInit_ex - init\n");
+ if (!EVP_EncryptUpdate(ctx, NULL, &outlen, aad, aad_len))
+ printf("\n ERROR!! EVP_EncryptUpdate - aad insert\n");
+ if (!EVP_EncryptUpdate(ctx, cyphertext, &outlen, (const uint8_t *)plaintext, len))
+		printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_256_gcm\n");
+	if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen))
+		printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_256_gcm\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_GET_TAG, tag_len, tag))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - tag \n");
+
+ EVP_CIPHER_CTX_free(ctx);
+ return tmplen;
+}
+
+#endif /* AES_OSSL_HELPER_H_ */
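These wrappers only exist to give the tests an OpenSSL reference to compare against; each prints a diagnostic on failure and the GCM pair reports an authentication failure by returning -1. A hedged round-trip self-check built on the helpers above follows, assuming ossl_helper.h and the OpenSSL headers are on the include path and that a 12-byte IV with a 16-byte tag is an acceptable example choice.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include "ossl_helper.h"

int main(void)
{
    uint8_t key[16] = { 0 }, iv[12] = { 0 }, aad[8] = { 0 }, tag[16];
    uint8_t pt[64], ct[64], out[64];
    int ret;

    memset(pt, 0xa5, sizeof(pt));

    /* Encrypt, producing ciphertext and tag, then decrypt and authenticate. */
    openssl_aes_gcm_enc(key, iv, sizeof(iv), aad, sizeof(aad),
                        tag, sizeof(tag), pt, sizeof(pt), ct);
    ret = openssl_aes_gcm_dec(key, iv, sizeof(iv), aad, sizeof(aad),
                              tag, sizeof(tag), ct, sizeof(ct), out);

    if (ret < 0 || memcmp(pt, out, sizeof(pt)))
        printf("GCM round trip failed\n");
    else
        printf("GCM round trip ok\n");
    return 0;
}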
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c
new file mode 100644
index 000000000..5dc898992
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c
@@ -0,0 +1,143 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+#include <openssl/evp.h>
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p,
+ int n)
+{
+ int i;
+ for (i = 0; i < 16; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ *k3++ = rand();
+ }
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+static inline
+ int openssl_aes_128_xts_dec(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+ unsigned char *ct, unsigned char *dt)
+{
+ int outlen, tmplen;
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv))
+ printf("\n ERROR!! \n");
+ if (!EVP_DecryptUpdate(ctx, dt, &outlen, (const unsigned char *)ct, TEST_LEN))
+ printf("\n ERROR!! \n");
+ if (!EVP_DecryptFinal_ex(ctx, dt + outlen, &tmplen))
+ printf("\n ERROR!! \n");
+
+ return 0;
+}
+
+int main(void)
+{
+ int i;
+
+ unsigned char key1[16], key2[16], tinit[16];
+ unsigned char *pt, *ct, *dt, *refdt;
+ unsigned char keyssl[32]; /* SSL takes both keys together */
+ struct perf start, stop;
+
+	/* Initialise our cipher context; it is reused for all the calls below */
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ printf("aes_xts_128_dec_perf:\n");
+
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+ refdt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt || NULL == refdt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ /* Set up key for the SSL engine */
+ for (i = 0; i < 16; i++) {
+ keyssl[i] = key1[i];
+ keyssl[i + 16] = key2[i];
+ }
+
+ /* Encrypt and compare decrypted output */
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ openssl_aes_128_xts_dec(ctx, keyssl, tinit, ct, refdt);
+ if (memcmp(dt, refdt, TEST_LEN)) {
+ printf("ISA-L and OpenSSL results don't match\n");
+ return -1;
+ }
+
+ /* Time ISA-L decryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ perf_stop(&stop);
+ printf("aes_xts_128_dec" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ /* Time OpenSSL decryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ openssl_aes_128_xts_dec(ctx, keyssl, tinit, ct, refdt);
+ perf_stop(&stop);
+ printf("aes_xts_128_openssl_dec" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ EVP_CIPHER_CTX_free(ctx);
+
+ return 0;
+}
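Two conventions are worth noting from this test: the ISA-L XTS entry points take the tweak key (key2) as their first argument and the data key (key1) second, while OpenSSL's EVP_aes_128_xts() expects one 32-byte buffer with the data key in the low half, which is why keyssl is assembled as key1 followed by key2. A hedged round-trip sketch using only the ISA-L side follows; the buffer length is arbitrary as long as it is at least one 16-byte block.

#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "aes_xts.h"

int main(void)
{
    uint8_t key1[16] = { 0 }, key2[16] = { 0 }, tweak[16] = { 0 };
    uint8_t pt[512], ct[512], out[512];

    memset(pt, 0x5a, sizeof(pt));

    /* Tweak key first, data key second, then tweak, length, input, output. */
    XTS_AES_128_enc(key2, key1, tweak, sizeof(pt), pt, ct);
    XTS_AES_128_dec(key2, key1, tweak, sizeof(ct), ct, out);

    printf("%s\n", memcmp(pt, out, sizeof(pt)) ? "mismatch" : "ok");
    return 0;
}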
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_perf.c
new file mode 100644
index 000000000..fdaa8a9bb
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_perf.c
@@ -0,0 +1,125 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "aes_keyexp.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 3000000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 400
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p,
+ int n)
+{
+ int i;
+ for (i = 0; i < 16; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ *k3++ = rand();
+ }
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+int main(void)
+{
+ int i;
+
+ unsigned char key1[16], key2[16], tinit[16];
+ unsigned char *pt, *ct, *dt;
+ uint8_t expkey1_enc[16 * 11], expkey2_enc[16 * 11];
+ uint8_t expkey1_dec[16 * 11], null_key[16 * 11];
+
+ printf("aes_xts_128_dec_perf:\n");
+
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ /* Decode perf test */
+
+ mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+
+ struct perf start, stop;
+
+ perf_start(&start);
+
+ for (i = 0; i < TEST_LOOPS; i++) {
+ XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ }
+
+ perf_stop(&stop);
+
+ printf("aes_xts_128_dec" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ /* Expanded keys perf test */
+
+ aes_keyexp_128(key1, expkey1_enc, expkey1_dec);
+ aes_keyexp_128(key2, expkey2_enc, null_key);
+ XTS_AES_128_dec_expanded_key(expkey2_enc, expkey1_dec, tinit, TEST_LEN, ct, pt);
+
+ perf_start(&start);
+
+ for (i = 0; i < TEST_LOOPS; i++) {
+ XTS_AES_128_dec_expanded_key(expkey2_enc, expkey1_dec, tinit, TEST_LEN, ct,
+ pt);
+ }
+
+ perf_stop(&stop);
+
+ printf("aes_xts_128_dec_expanded_key" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c
new file mode 100644
index 000000000..69ae2e60e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c
@@ -0,0 +1,144 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+#include <openssl/evp.h>
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+void xts128_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3,
+ unsigned char *p, int n)
+{
+ int i;
+ for (i = 0; i < 16; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ *k3++ = rand();
+ }
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+static inline
+ int openssl_aes_128_xts_enc(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+ int len, unsigned char *pt, unsigned char *ct)
+{
+ int outlen, tmplen;
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv))
+ printf("\n ERROR!! \n");
+ if (!EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len))
+ printf("\n ERROR!! \n");
+ if (!EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen))
+ printf("\n ERROR!! \n");
+
+ return 0;
+}
+
+int main(void)
+{
+ int i;
+
+ unsigned char key1[16], key2[16], tinit[16];
+ unsigned char *pt, *ct, *refct;
+ struct perf start, stop;
+ unsigned char keyssl[32]; /* SSL takes both keys together */
+
+	/* Initialise our cipher context; it is reused for all the calls below */
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ printf("aes_xts_128_enc_perf:\n");
+
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ refct = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == refct) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ xts128_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+
+ /* Set up key for the SSL engine */
+ for (i = 0; i < 16; i++) {
+ keyssl[i] = key1[i];
+ keyssl[i + 16] = key2[i];
+ }
+
+ /* Encrypt and compare output */
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ openssl_aes_128_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct);
+ if (memcmp(ct, refct, TEST_LEN)) {
+ printf("ISA-L and OpenSSL results don't match\n");
+ return -1;
+ }
+
+ /* Time ISA-L encryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ perf_stop(&stop);
+
+ printf("aes_xts_128_enc" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ /* Time OpenSSL encryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ openssl_aes_128_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct);
+ perf_stop(&stop);
+
+ printf("aes_xts_128_openssl_enc" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ EVP_CIPHER_CTX_free(ctx);
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_perf.c
new file mode 100644
index 000000000..166e46652
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_perf.c
@@ -0,0 +1,123 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "aes_keyexp.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 3000000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 400
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p,
+ int n)
+{
+ int i;
+ for (i = 0; i < 16; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ *k3++ = rand();
+ }
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+int main(void)
+{
+ int i;
+
+ unsigned char key1[16], key2[16], tinit[16];
+ unsigned char *pt, *ct;
+ uint8_t expkey1_enc[16 * 11], expkey2_enc[16 * 11];
+ uint8_t expkey1_dec[16 * 11], null_key[16 * 11];
+
+ printf("aes_xts_128_enc_perf:\n");
+
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ /* Encode perf test */
+
+ mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+
+ struct perf start, stop;
+
+ perf_start(&start);
+
+ for (i = 0; i < TEST_LOOPS; i++) {
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ }
+
+ perf_stop(&stop);
+
+ printf("aes_xts_128_enc" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ /* Expanded keys perf test */
+
+ aes_keyexp_128(key1, expkey1_enc, expkey1_dec);
+ aes_keyexp_128(key2, expkey2_enc, null_key);
+ XTS_AES_128_enc_expanded_key(expkey2_enc, expkey1_enc, tinit, TEST_LEN, pt, ct);
+
+ perf_start(&start);
+
+ for (i = 0; i < TEST_LOOPS; i++) {
+ XTS_AES_128_enc_expanded_key(expkey2_enc, expkey1_enc, tinit, TEST_LEN, pt,
+ ct);
+ }
+
+ perf_stop(&stop);
+
+ printf("aes_xts_128_enc_expanded_key" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ return 0;
+}
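Throughput in these perf programs is reported by the bundled test.h harness as (TEST_LEN * loops) bytes over the measured interval. Where that harness is not available, a portable approximation with clock_gettime() gives the same shape of measurement; hedged: this swaps in standard POSIX timing and arbitrary sizes and is not the perf_start()/perf_print() machinery used above.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "aes_xts.h"

#define LEN   (8 * 1024)
#define LOOPS 100000

int main(void)
{
    uint8_t key1[16] = { 0 }, key2[16] = { 0 }, tweak[16] = { 0 };
    uint8_t *pt = malloc(LEN), *ct = malloc(LEN);
    struct timespec t0, t1;
    double secs;
    int i;

    if (!pt || !ct)
        return -1;
    memset(pt, 0, LEN);

    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (i = 0; i < LOOPS; i++)
        XTS_AES_128_enc(key2, key1, tweak, LEN, pt, ct);
    clock_gettime(CLOCK_MONOTONIC, &t1);

    /* Bytes processed divided by elapsed seconds, reported in MB/s. */
    secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
    printf("%.1f MB/s\n", (double)LEN * LOOPS / secs / 1e6);

    free(pt);
    free(ct);
    return 0;
}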
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_expanded_key_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_expanded_key_test.c
new file mode 100644
index 000000000..27599f0ca
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_expanded_key_test.c
@@ -0,0 +1,116 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <aes_keyexp.h>
+#include "xts_128_vect.h"
+
+int main(void)
+{
+
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test;
+ uint8_t *pt_test;
+ // Arrays for expanded keys, null_key is a dummy vector (decrypt key not
+ // needed for the tweak part of the decryption)
+ uint8_t expkey1_enc[16 * 11], expkey2_enc[16 * 11];
+ uint8_t expkey1_dec[16 * 11], null_key[16 * 11];
+
+ int i, j;
+
+ // --- Encryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vlist[i].ptlen);
+ if (ct_test == NULL) {
+ printf("Can't allocate ciphertext memory\n");
+ return -1;
+ }
+ // Pre-expand keys (will only use the encryption ones here)
+ aes_keyexp_128(vlist[i].key1, expkey1_enc, expkey1_dec);
+ aes_keyexp_128(vlist[i].key2, expkey2_enc, null_key);
+
+ XTS_AES_128_enc_expanded_key(expkey2_enc, expkey1_enc, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].PTX, ct_test);
+
+ // Carry out comparison of the calculated ciphertext with
+ // the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (ct_test[j] != vlist[i].CTX[j]) {
+ // Vectors 1-10 and 15-19 are for the 128 bit code
+ printf("\nXTS_AES_128_enc: Vector %d: ",
+ i < 9 ? i + 1 : i + 6);
+ printf("failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+ }
+
+ // --- Decryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+		// Allocate space for the calculated plaintext
+ pt_test = malloc(vlist[i].ptlen);
+ if (pt_test == NULL) {
+ printf("Can't allocate plaintext memory\n");
+ return -1;
+ }
+ // Pre-expand keys for the decryption
+ aes_keyexp_128(vlist[i].key1, expkey1_enc, expkey1_dec);
+ aes_keyexp_128(vlist[i].key2, expkey2_enc, null_key);
+
+ // Note, encryption key is re-used for the tweak decryption step
+ XTS_AES_128_dec_expanded_key(expkey2_enc, expkey1_dec, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].CTX, pt_test);
+
+		// Carry out comparison of the calculated plaintext with
+		// the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (pt_test[j] != vlist[i].PTX[j]) {
+				printf("\nXTS_AES_128_dec: Vector %d: ",
+ i < 9 ? i + 1 : i + 6);
+ printf(" failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+ }
+ printf("Pass\n");
+
+ return 0;
+}
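The point of the *_expanded_key entry points is to pay for the key schedule once: aes_keyexp_128() is run up front and the XTS calls then take the pre-expanded schedules directly. Only the encryption schedule of the tweak key is ever used, even when decrypting, which is why the test passes a throwaway buffer for its decryption schedule. A hedged sketch of that flow, mirroring the calls above:

#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <aes_keyexp.h>
#include <aes_xts.h>

int main(void)
{
    uint8_t key1[16] = { 0 }, key2[16] = { 0 }, tweak[16] = { 0 };
    uint8_t k1_enc[16 * 11], k1_dec[16 * 11];      /* data-key schedules     */
    uint8_t k2_enc[16 * 11], unused[16 * 11];      /* tweak key: enc only    */
    uint8_t pt[256], ct[256], out[256];

    memset(pt, 0x42, sizeof(pt));

    aes_keyexp_128(key1, k1_enc, k1_dec);
    aes_keyexp_128(key2, k2_enc, unused);          /* dec schedule discarded */

    XTS_AES_128_enc_expanded_key(k2_enc, k1_enc, tweak, sizeof(pt), pt, ct);
    XTS_AES_128_dec_expanded_key(k2_enc, k1_dec, tweak, sizeof(ct), ct, out);

    printf("%s\n", memcmp(pt, out, sizeof(pt)) ? "mismatch" : "ok");
    return 0;
}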
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c
new file mode 100644
index 000000000..4753d6778
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c
@@ -0,0 +1,247 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include <aes_xts.h>
+#include <aes_keyexp.h>
+
+#define TEST_LEN (1024*1024)
+#define TEST_SIZE (4096)
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+
+void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p,
+ int n)
+{
+ int i;
+ for (i = 0; i < 16; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ *k3++ = rand();
+ }
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+int main(void)
+{
+ int t, n;
+
+ unsigned char key1[16], key2[16], tinit[16];
+ unsigned char *pt, *ct, *dt;
+
+ int align, size, min_size;
+ unsigned char *efence_pt;
+ unsigned char *efence_ct;
+ unsigned char *efence_dt;
+
+ unsigned char *origin_pt;
+ unsigned char *origin_ct;
+ unsigned char *origin_dt;
+
+ unsigned char key1_exp_enc[16 * 11], key1_exp_dec[16 * 11];
+ unsigned char key2_exp_tw[16 * 11];
+ int i;
+
+ printf("aes_xts_128 enc/dec rand test, %d sets of %d max: ", RANDOMS, TEST_LEN);
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+
+ if (memcmp(pt, dt, TEST_LEN)) {
+ printf("fail\n");
+ return -1;
+ }
+ putchar('.');
+
+ // Do tests with random data, keys and message size
+ for (t = 0; t < RANDOMS; t++) {
+ n = rand() % (TEST_LEN);
+ if (n < 17)
+ continue;
+
+ mk_rand_data(key1, key2, tinit, pt, n);
+ XTS_AES_128_enc(key2, key1, tinit, n, pt, ct);
+ XTS_AES_128_dec(key2, key1, tinit, n, ct, dt);
+
+ if (memcmp(pt, dt, n)) {
+ printf("fail rand %d, size %d\n", t, n);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ // Run tests at end of buffer for Electric Fence
+ align = 1;
+ min_size = 16;
+ for (size = 0; size <= TEST_SIZE - min_size; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ XTS_AES_128_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_128_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) {
+ printf("efence: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ origin_pt = malloc(TEST_LEN);
+ origin_ct = malloc(TEST_LEN);
+ origin_dt = malloc(TEST_LEN);
+ if (NULL == origin_pt || NULL == origin_ct || NULL == origin_dt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+ // For data lengths from 0 to 15 bytes, the functions return without any error
+ // codes, without reading or writing any data.
+ for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ memcpy(efence_ct, efence_pt, TEST_SIZE - size);
+ memcpy(efence_dt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_pt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_ct, efence_ct, TEST_SIZE - size);
+ memcpy(origin_dt, efence_dt, TEST_SIZE - size);
+
+ XTS_AES_128_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_128_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) {
+ printf("efence_pt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) {
+ printf("efence_ct: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) {
+ printf("efence_dt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ for (i = 0; i < 16 * 11; i++) {
+ key2_exp_tw[i] = rand();
+ }
+
+ for (size = 0; size <= TEST_SIZE - min_size; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ aes_keyexp_128(key1, key1_exp_enc, key1_exp_dec);
+
+ XTS_AES_128_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit,
+ TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_128_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit,
+ TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) {
+ printf("efence_expanded_key: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ // For data lengths from 0 to 15 bytes, the functions return without any error
+ // codes, without reading or writing any data.
+ for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ memcpy(efence_ct, efence_pt, TEST_SIZE - size);
+ memcpy(efence_dt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_pt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_ct, efence_ct, TEST_SIZE - size);
+ memcpy(origin_dt, efence_dt, TEST_SIZE - size);
+
+ aes_keyexp_128(key1, key1_exp_enc, key1_exp_dec);
+
+ XTS_AES_128_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit,
+ TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_128_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit,
+ TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) {
+ printf("efence_expanded_key for pt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) {
+ printf("efence_expanded_key for ct: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) {
+ printf("efence_expanded_key for dt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ printf("Pass\n");
+
+ return 0;
+}
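The end-of-buffer loops above are an "Electric Fence" style check: the test region is slid up against the end of the allocation so that any read or write past the requested length falls outside the allocation and can be trapped by a guard-page allocator or a sanitizer. A stripped-down sketch of the same pointer arithmetic follows; hedged: the sizes are arbitrary and the actual trapping comes from running under such a tool, not from this code.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "aes_xts.h"

#define BUF_LEN  (64 * 1024)
#define REGION   4096

int main(void)
{
    uint8_t key1[16] = { 0 }, key2[16] = { 0 }, tweak[16] = { 0 };
    uint8_t *pt = malloc(BUF_LEN), *ct = malloc(BUF_LEN);
    int size;

    if (!pt || !ct)
        return -1;
    memset(pt, 0, BUF_LEN);

    /* Slide the test region toward the very end of the allocation; an
     * implementation that touches even one byte too many now faults under a
     * guard-page allocator instead of silently corrupting memory. */
    for (size = REGION; size >= 16; size--) {
        uint8_t *p = pt + BUF_LEN - size;
        uint8_t *c = ct + BUF_LEN - size;
        XTS_AES_128_enc(key2, key1, tweak, size, p, c);
    }

    free(pt);
    free(ct);
    return 0;
}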
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand_ossl_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand_ossl_test.c
new file mode 100644
index 000000000..065b84465
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand_ossl_test.c
@@ -0,0 +1,271 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aes_xts.h"
+#include <stdlib.h>
+#include <openssl/evp.h>
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#ifndef RANDOMS
+# define RANDOMS 128
+#endif
+#define TEST_LOOPS 128
+#define TEST_LEN (1024*1024)
+#define LENGTH_SCAN (2*1024)
+
+/* Generates random data for keys, tweak and plaintext */
+void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p,
+ int n)
+{
+ int i;
+ for (i = 0; i < 16; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ *k3++ = rand();
+ }
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+/* Wrapper for OpenSSL EVP AES-XTS 128 encryption */
+static inline
+ int openssl_aes_128_xts_enc(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+ int len, unsigned char *pt, unsigned char *ct)
+{
+ int outlen, tmplen;
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv)
+ || (!EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len))
+ || (!EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen))) {
+ printf("\n Error in openssl encoding of %d bytes\n", len);
+ return 1;
+ }
+ return 0;
+}
+
+/* Wrapper for OpenSSL EVP AES-XTS 128 decryption */
+static inline
+ int openssl_aes_128_xts_dec(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+ int len, unsigned char *ct, unsigned char *dt)
+{
+ int outlen, tmplen;
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv)
+ || (!EVP_DecryptUpdate(ctx, dt, &outlen, (const unsigned char *)ct, len))
+ || (!EVP_DecryptFinal_ex(ctx, dt + outlen, &tmplen))) {
+ printf("\n Error in openssl decoding of %d bytes\n", len);
+ return 1;
+ }
+ return 0;
+}
+
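+/* Note on key ordering, as used by the calls below: the ISA-L routines take
+ * the tweak key (key2) first and the data key (key1) second, e.g.
+ *   XTS_AES_128_enc(key2, key1, tweak, len, pt, ct);
+ * while OpenSSL's EVP_aes_128_xts() takes a single 32-byte key laid out as
+ * key1 || key2, which is why keyssl[] is built by concatenating both keys.
+ */
+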
+int main(int argc, char **argv)
+{
+
+ unsigned char key1[16], key2[16], tinit[16];
+ unsigned char *pt, *ct, *dt, *refct, *refdt;
+ unsigned char keyssl[32]; /* SSL takes both keys together */
+ unsigned int rand_len, t;
+ int i, j, k, ret;
+ int seed;
+
+ if (argc == 1)
+ seed = TEST_SEED;
+ else
+ seed = atoi(argv[1]);
+
+ srand(seed);
+ printf("SEED: %d\n", seed);
+
+	/* Initialise our cipher context; it is reused for all of the tests below */
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ /* Allocate space for input and output buffers */
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+ refct = malloc(TEST_LEN);
+ refdt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt || NULL == refct || NULL == refdt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ /**************************** LENGTH SCAN TEST *************************/
+	printf("aes_xts_128_rand_ossl test, %d sets of various lengths: ", LENGTH_SCAN);
+
+ mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+
+ /* Set up key for the SSL engine */
+ for (k = 0; k < 16; k++) {
+ keyssl[k] = key1[k];
+ keyssl[k + 16] = key2[k];
+ }
+
+ for (ret = 0, i = 16; ret == 0 && i < LENGTH_SCAN; i++) {
+
+ /* Encrypt using each method */
+ XTS_AES_128_enc(key2, key1, tinit, i, pt, ct);
+ ret |= openssl_aes_128_xts_enc(ctx, keyssl, tinit, i, pt, refct);
+
+ // Compare
+ for (ret = 0, j = 0; j < i && ret == 0; j++) {
+ if (ct[j] != refct[j])
+ ret = 1;
+ }
+ if (ret)
+ printf(" XTS_AES_128_enc size=%d failed at byte %d!\n", i, j);
+
+ /* Decrypt using each method */
+ XTS_AES_128_dec(key2, key1, tinit, i, ct, dt);
+ ret |= openssl_aes_128_xts_dec(ctx, keyssl, tinit, i, refct, refdt);
+
+		for (j = 0; j < i && ret == 0; j++) {
+ if (dt[j] != refdt[j])
+ ret = 1;
+ }
+ if (ret)
+ printf(" XTS_AES_128_dec size=%d failed at byte %d!\n", i, j);
+ if (0 == i % (LENGTH_SCAN / 16))
+ printf(".");
+ fflush(0);
+ }
+ if (ret)
+ return -1;
+ printf("Pass\n");
+
+ /**************************** FIXED LENGTH TEST *************************/
+ printf("aes_xts_128_rand_ossl test, %d sets of length %d: ", TEST_LOOPS, TEST_LEN);
+
+ // Loop over the vectors
+ for (i = 0; i < TEST_LOOPS; i++) {
+
+ mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+
+ /* Set up key for the SSL engine */
+ for (k = 0; k < 16; k++) {
+ keyssl[k] = key1[k];
+ keyssl[k + 16] = key2[k];
+ }
+
+ /* Encrypt using each method */
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ if (openssl_aes_128_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct))
+ return -1;
+
+ /* Carry out comparison of the calculated ciphertext with
+ * the reference
+ */
+ for (j = 0; j < TEST_LEN; j++) {
+
+ if (ct[j] != refct[j]) {
+ printf("XTS_AES_128_enc failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+
+ /* Decrypt using each method */
+ XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ if (openssl_aes_128_xts_dec(ctx, keyssl, tinit, TEST_LEN, refct, refdt))
+ return -1;
+
+ for (j = 0; j < TEST_LEN; j++) {
+
+ if (dt[j] != refdt[j]) {
+ printf("XTS_AES_128_dec failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ if (0 == i % (TEST_LOOPS / 16))
+ printf(".");
+ fflush(0);
+ }
+ printf("Pass\n");
+
+ /**************************** RANDOM LENGTH TEST *************************/
+ printf("aes_xts_128_rand_ossl test, %d sets of random lengths: ", RANDOMS);
+
+ /* Run tests with random size */
+
+ for (t = 0; t < RANDOMS; t++) {
+
+ rand_len = rand() % (TEST_LEN);
+ rand_len = rand_len < 16 ? 16 : rand_len;
+ mk_rand_data(key1, key2, tinit, pt, rand_len);
+
+ /* Set up key for the SSL engine */
+ for (k = 0; k < 16; k++) {
+ keyssl[k] = key1[k];
+ keyssl[k + 16] = key2[k];
+ }
+
+ /* Encrypt using each method */
+ XTS_AES_128_enc(key2, key1, tinit, rand_len, pt, ct);
+ if (openssl_aes_128_xts_enc(ctx, keyssl, tinit, rand_len, pt, refct))
+ return -1;
+
+ /* Carry out comparison of the calculated ciphertext with
+ * the reference
+ */
+ for (j = 0; j < rand_len; j++) {
+
+ if (ct[j] != refct[j]) {
+ printf("XTS_AES_128_enc failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+
+ /* Decrypt using each method */
+ XTS_AES_128_dec(key2, key1, tinit, rand_len, ct, dt);
+ if (openssl_aes_128_xts_dec(ctx, keyssl, tinit, rand_len, refct, refdt))
+ return -1;
+
+ for (j = 0; j < rand_len; j++) {
+
+ if (dt[j] != refdt[j]) {
+ printf("XTS_AES_128_dec failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ if (0 == t % (RANDOMS / 16))
+ printf(".");
+ fflush(0);
+ }
+
+ EVP_CIPHER_CTX_free(ctx);
+
+ printf("Pass\n");
+
+ printf("aes_xts_128_rand_ossl: All tests passed\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c
new file mode 100644
index 000000000..5dd57e33c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c
@@ -0,0 +1,106 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "xts_128_vect.h"
+
+int main(void)
+{
+
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test;
+ uint8_t *pt_test;
+
+ int i, j;
+
+ // --- Encryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vlist[i].ptlen);
+ if (ct_test == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return -1;
+ }
+
+ XTS_AES_128_enc(vlist[i].key2, vlist[i].key1, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].PTX, ct_test);
+
+ // Carry out comparison of the calculated ciphertext with
+ // the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (ct_test[j] != vlist[i].CTX[j]) {
+				// Vectors 1-9 and 15-19 are for the 128 bit code
+ printf("\nXTS_AES_128_enc: Vector %d: ",
+ i < 9 ? i + 1 : i + 6);
+
+ printf("failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+		printf(".");
+		free(ct_test);
+	}
+
+ // --- Decryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+		// Allocate space for the calculated plaintext
+ pt_test = malloc(vlist[i].ptlen);
+ if (pt_test == NULL) {
+ fprintf(stderr, "Can't allocate plaintext memory\n");
+ return -1;
+ }
+
+ XTS_AES_128_dec(vlist[i].key2, vlist[i].key1, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].CTX, pt_test);
+
+		// Carry out comparison of the calculated plaintext with
+		// the reference
+		for (j = 0; j < vlist[i].ptlen; j++) {
+
+			if (pt_test[j] != vlist[i].PTX[j]) {
+				// Vectors 1-9 and 15-19 are for the 128 bit code
+				printf("\nXTS_AES_128_dec: Vector %d: ",
+				       i < 9 ? i + 1 : i + 6);
+
+				printf("failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+		printf(".");
+		free(pt_test);
+	}
+ printf("Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h
new file mode 100644
index 000000000..fce792dc7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h
@@ -0,0 +1,1691 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aes_xts.h"
+
+#define NVEC 14
+
+// struct to hold pointers to the key, plaintext and ciphertext vectors
+struct xts_vector {
+ uint64_t ptlen; // length of our plaintext
+ uint8_t *key1; // dimension 16 for 128 bit aes
+ uint8_t *key2; // dimension 16 for 128 bit aes
+ uint8_t *TW; // dimension 16 for both 128 and 256 bit
+ uint8_t *PTX; // min. dimension 16
+ uint8_t *CTX; // same dimension as PTX
+};
+
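+/* Each vN_* array below is gathered into a vlist[NVEC] table of these structs
+ * later in this header (conceptually
+ * { sizeof(vN_PTX), vN_key1, vN_key2, vN_TW, vN_PTX, vN_CTX }); that table is
+ * what xts_128_test.c iterates over.
+ */
+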
+/* Define our test vectors statically here. Test vectors are from the standard:
+ * "IEEE Standard for Cryptographic Protection of Data on Block-Oriented
+ * Storage Devices"
+ * http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4493450
+ *
+ * Vector 1
+ * Key1 00000000000000000000000000000000
+ * Key2 00000000000000000000000000000000
+ * Data Unit Sequence number 0
+ * PTX 0000000000000000000000000000000000000000000000000000000000000000 /128bit
+ * TWK 66e94bd4ef8a2c3b884cfa59ca342b2eccd297a8df1559761099f4b39469565c
+ * CTX 917cf69ebd68b2ec9b9fe9a3eadda692cd43d2f59598ed858c02c2652fbf922e
+ * Plaintext length (bytes): 32
+ */
+
+static uint8_t v1_key1[16] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v1_key2[16] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v1_TW[16] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v1_PTX[32] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v1_CTX[32] = {
+ 0x91, 0x7c, 0xf6, 0x9e, 0xbd, 0x68, 0xb2, 0xec,
+ 0x9b, 0x9f, 0xe9, 0xa3, 0xea, 0xdd, 0xa6, 0x92,
+ 0xcd, 0x43, 0xd2, 0xf5, 0x95, 0x98, 0xed, 0x85,
+ 0x8c, 0x02, 0xc2, 0x65, 0x2f, 0xbf, 0x92, 0x2e
+};
+
+/*
+ * Vector 2
+ * Key1 11111111111111111111111111111111
+ * Key2 22222222222222222222222222222222
+ * Data Unit Sequence number 3333333333
+ * PTX 4444444444444444444444444444444444444444444444444444444444444444
+ * TWK 3f803bcd0d7fd2b37558419f59d5cda6f900779a1bfea467ebb0823eb3aa9b4d
+ * CTX c454185e6a16936e39334038acef838bfb186fff7480adc4289382ecd6d394f0
+ * Plaintext length (bytes): 32
+ */
+
+static uint8_t v2_key1[16] = {
+ 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+ 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11
+};
+
+static uint8_t v2_key2[16] = {
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22
+};
+
+static uint8_t v2_TW[16] = {
+ 0x33, 0x33, 0x33, 0x33, 0x33, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v2_PTX[32] = {
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44
+};
+
+static uint8_t v2_CTX[32] = {
+ 0xc4, 0x54, 0x18, 0x5e, 0x6a, 0x16, 0x93, 0x6e,
+ 0x39, 0x33, 0x40, 0x38, 0xac, 0xef, 0x83, 0x8b,
+ 0xfb, 0x18, 0x6f, 0xff, 0x74, 0x80, 0xad, 0xc4,
+ 0x28, 0x93, 0x82, 0xec, 0xd6, 0xd3, 0x94, 0xf0
+};
+
+/*
+ * Vector 3
+ * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0
+ * Key2 22222222222222222222222222222222
+ * Data Unit Sequence number 3333333333
+ * PTX 4444444444444444444444444444444444444444444444444444444444444444
+ * TWK 3f803bcd0d7fd2b37558419f59d5cda6f900779a1bfea467ebb0823eb3aa9b4d
+ * CTX af85336b597afc1a900b2eb21ec949d292df4c047e0b21532186a5971a227a89
+ * Plaintext length (bytes): 32
+ */
+
+static uint8_t v3_key1[16] = {
+ 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8,
+ 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0
+};
+
+static uint8_t v3_key2[16] = {
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22
+};
+
+static uint8_t v3_TW[16] = {
+ 0x33, 0x33, 0x33, 0x33, 0x33, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v3_PTX[32] = {
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44
+};
+
+static uint8_t v3_CTX[32] = {
+ 0xaf, 0x85, 0x33, 0x6b, 0x59, 0x7a, 0xfc, 0x1a,
+ 0x90, 0x0b, 0x2e, 0xb2, 0x1e, 0xc9, 0x49, 0xd2,
+ 0x92, 0xdf, 0x4c, 0x04, 0x7e, 0x0b, 0x21, 0x53,
+ 0x21, 0x86, 0xa5, 0x97, 0x1a, 0x22, 0x7a, 0x89
+};
+
+/*
+ * Vector 4
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence number 0
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX 27a7479befa1d476489f308cd4cfa6e2a96e4bbe3208ff25287dd3819616e89c
+ * CTX c78cf7f5e543445f8333d8fa7f56000005279fa5d8b5e4ad40e736ddb4d35412
+ * CTX 328063fd2aab53e5ea1e0a9f332500a5df9487d07a5c92cc512c8866c7e860ce
+ * CTX 93fdf166a24912b422976146ae20ce846bb7dc9ba94a767aaef20c0d61ad0265
+ * CTX 5ea92dc4c4e41a8952c651d33174be51a10c421110e6d81588ede82103a252d8
+ * CTX a750e8768defffed9122810aaeb99f9172af82b604dc4b8e51bcb08235a6f434
+ * CTX 1332e4ca60482a4ba1a03b3e65008fc5da76b70bf1690db4eae29c5f1badd03c
+ * CTX 5ccf2a55d705ddcd86d449511ceb7ec30bf12b1fa35b913f9f747a8afd1b130e
+ * CTX 94bff94effd01a91735ca1726acd0b197c4e5b03393697e126826fb6bbde8ecc
+ * CTX 1e08298516e2c9ed03ff3c1b7860f6de76d4cecd94c8119855ef5297ca67e9f3
+ * CTX e7ff72b1e99785ca0a7e7720c5b36dc6d72cac9574c8cbbc2f801e23e56fd344
+ * CTX b07f22154beba0f08ce8891e643ed995c94d9a69c9f1b5f499027a78572aeebd
+ * CTX 74d20cc39881c213ee770b1010e4bea718846977ae119f7a023ab58cca0ad752
+ * CTX afe656bb3c17256a9f6e9bf19fdd5a38fc82bbe872c5539edb609ef4f79c203e
+ * CTX bb140f2e583cb2ad15b4aa5b655016a8449277dbd477ef2c8d6c017db738b18d
+ * CTX eb4a427d1923ce3ff262735779a418f20a282df920147beabe421ee5319d0568
+ * Plaintext length (bytes): 512
+ */
+static uint8_t v4_key1[16] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+static uint8_t v4_key2[16] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+static uint8_t v4_TW[16] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v4_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v4_CTX[512] = {
+ 0x27, 0xa7, 0x47, 0x9b, 0xef, 0xa1, 0xd4, 0x76,
+ 0x48, 0x9f, 0x30, 0x8c, 0xd4, 0xcf, 0xa6, 0xe2,
+ 0xa9, 0x6e, 0x4b, 0xbe, 0x32, 0x08, 0xff, 0x25,
+ 0x28, 0x7d, 0xd3, 0x81, 0x96, 0x16, 0xe8, 0x9c,
+ 0xc7, 0x8c, 0xf7, 0xf5, 0xe5, 0x43, 0x44, 0x5f,
+ 0x83, 0x33, 0xd8, 0xfa, 0x7f, 0x56, 0x00, 0x00,
+ 0x05, 0x27, 0x9f, 0xa5, 0xd8, 0xb5, 0xe4, 0xad,
+ 0x40, 0xe7, 0x36, 0xdd, 0xb4, 0xd3, 0x54, 0x12,
+ 0x32, 0x80, 0x63, 0xfd, 0x2a, 0xab, 0x53, 0xe5,
+ 0xea, 0x1e, 0x0a, 0x9f, 0x33, 0x25, 0x00, 0xa5,
+ 0xdf, 0x94, 0x87, 0xd0, 0x7a, 0x5c, 0x92, 0xcc,
+ 0x51, 0x2c, 0x88, 0x66, 0xc7, 0xe8, 0x60, 0xce,
+ 0x93, 0xfd, 0xf1, 0x66, 0xa2, 0x49, 0x12, 0xb4,
+ 0x22, 0x97, 0x61, 0x46, 0xae, 0x20, 0xce, 0x84,
+ 0x6b, 0xb7, 0xdc, 0x9b, 0xa9, 0x4a, 0x76, 0x7a,
+ 0xae, 0xf2, 0x0c, 0x0d, 0x61, 0xad, 0x02, 0x65,
+ 0x5e, 0xa9, 0x2d, 0xc4, 0xc4, 0xe4, 0x1a, 0x89,
+ 0x52, 0xc6, 0x51, 0xd3, 0x31, 0x74, 0xbe, 0x51,
+ 0xa1, 0x0c, 0x42, 0x11, 0x10, 0xe6, 0xd8, 0x15,
+ 0x88, 0xed, 0xe8, 0x21, 0x03, 0xa2, 0x52, 0xd8,
+ 0xa7, 0x50, 0xe8, 0x76, 0x8d, 0xef, 0xff, 0xed,
+ 0x91, 0x22, 0x81, 0x0a, 0xae, 0xb9, 0x9f, 0x91,
+ 0x72, 0xaf, 0x82, 0xb6, 0x04, 0xdc, 0x4b, 0x8e,
+ 0x51, 0xbc, 0xb0, 0x82, 0x35, 0xa6, 0xf4, 0x34,
+ 0x13, 0x32, 0xe4, 0xca, 0x60, 0x48, 0x2a, 0x4b,
+ 0xa1, 0xa0, 0x3b, 0x3e, 0x65, 0x00, 0x8f, 0xc5,
+ 0xda, 0x76, 0xb7, 0x0b, 0xf1, 0x69, 0x0d, 0xb4,
+ 0xea, 0xe2, 0x9c, 0x5f, 0x1b, 0xad, 0xd0, 0x3c,
+ 0x5c, 0xcf, 0x2a, 0x55, 0xd7, 0x05, 0xdd, 0xcd,
+ 0x86, 0xd4, 0x49, 0x51, 0x1c, 0xeb, 0x7e, 0xc3,
+ 0x0b, 0xf1, 0x2b, 0x1f, 0xa3, 0x5b, 0x91, 0x3f,
+ 0x9f, 0x74, 0x7a, 0x8a, 0xfd, 0x1b, 0x13, 0x0e,
+ 0x94, 0xbf, 0xf9, 0x4e, 0xff, 0xd0, 0x1a, 0x91,
+ 0x73, 0x5c, 0xa1, 0x72, 0x6a, 0xcd, 0x0b, 0x19,
+ 0x7c, 0x4e, 0x5b, 0x03, 0x39, 0x36, 0x97, 0xe1,
+ 0x26, 0x82, 0x6f, 0xb6, 0xbb, 0xde, 0x8e, 0xcc,
+ 0x1e, 0x08, 0x29, 0x85, 0x16, 0xe2, 0xc9, 0xed,
+ 0x03, 0xff, 0x3c, 0x1b, 0x78, 0x60, 0xf6, 0xde,
+ 0x76, 0xd4, 0xce, 0xcd, 0x94, 0xc8, 0x11, 0x98,
+ 0x55, 0xef, 0x52, 0x97, 0xca, 0x67, 0xe9, 0xf3,
+ 0xe7, 0xff, 0x72, 0xb1, 0xe9, 0x97, 0x85, 0xca,
+ 0x0a, 0x7e, 0x77, 0x20, 0xc5, 0xb3, 0x6d, 0xc6,
+ 0xd7, 0x2c, 0xac, 0x95, 0x74, 0xc8, 0xcb, 0xbc,
+ 0x2f, 0x80, 0x1e, 0x23, 0xe5, 0x6f, 0xd3, 0x44,
+ 0xb0, 0x7f, 0x22, 0x15, 0x4b, 0xeb, 0xa0, 0xf0,
+ 0x8c, 0xe8, 0x89, 0x1e, 0x64, 0x3e, 0xd9, 0x95,
+ 0xc9, 0x4d, 0x9a, 0x69, 0xc9, 0xf1, 0xb5, 0xf4,
+ 0x99, 0x02, 0x7a, 0x78, 0x57, 0x2a, 0xee, 0xbd,
+ 0x74, 0xd2, 0x0c, 0xc3, 0x98, 0x81, 0xc2, 0x13,
+ 0xee, 0x77, 0x0b, 0x10, 0x10, 0xe4, 0xbe, 0xa7,
+ 0x18, 0x84, 0x69, 0x77, 0xae, 0x11, 0x9f, 0x7a,
+ 0x02, 0x3a, 0xb5, 0x8c, 0xca, 0x0a, 0xd7, 0x52,
+ 0xaf, 0xe6, 0x56, 0xbb, 0x3c, 0x17, 0x25, 0x6a,
+ 0x9f, 0x6e, 0x9b, 0xf1, 0x9f, 0xdd, 0x5a, 0x38,
+ 0xfc, 0x82, 0xbb, 0xe8, 0x72, 0xc5, 0x53, 0x9e,
+ 0xdb, 0x60, 0x9e, 0xf4, 0xf7, 0x9c, 0x20, 0x3e,
+ 0xbb, 0x14, 0x0f, 0x2e, 0x58, 0x3c, 0xb2, 0xad,
+ 0x15, 0xb4, 0xaa, 0x5b, 0x65, 0x50, 0x16, 0xa8,
+ 0x44, 0x92, 0x77, 0xdb, 0xd4, 0x77, 0xef, 0x2c,
+ 0x8d, 0x6c, 0x01, 0x7d, 0xb7, 0x38, 0xb1, 0x8d,
+ 0xeb, 0x4a, 0x42, 0x7d, 0x19, 0x23, 0xce, 0x3f,
+ 0xf2, 0x62, 0x73, 0x57, 0x79, 0xa4, 0x18, 0xf2,
+ 0x0a, 0x28, 0x2d, 0xf9, 0x20, 0x14, 0x7b, 0xea,
+ 0xbe, 0x42, 0x1e, 0xe5, 0x31, 0x9d, 0x05, 0x68
+};
+
+/*
+ * Vector 5
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence Number 01
+ * PTX 27a7479befa1d476489f308cd4cfa6e2a96e4bbe3208ff25287dd3819616e89c
+ * PTX c78cf7f5e543445f8333d8fa7f56000005279fa5d8b5e4ad40e736ddb4d35412
+ * PTX 328063fd2aab53e5ea1e0a9f332500a5df9487d07a5c92cc512c8866c7e860ce
+ * PTX 93fdf166a24912b422976146ae20ce846bb7dc9ba94a767aaef20c0d61ad0265
+ * PTX 5ea92dc4c4e41a8952c651d33174be51a10c421110e6d81588ede82103a252d8
+ * PTX a750e8768defffed9122810aaeb99f9172af82b604dc4b8e51bcb08235a6f434
+ * PTX 1332e4ca60482a4ba1a03b3e65008fc5da76b70bf1690db4eae29c5f1badd03c
+ * PTX 5ccf2a55d705ddcd86d449511ceb7ec30bf12b1fa35b913f9f747a8afd1b130e
+ * PTX 94bff94effd01a91735ca1726acd0b197c4e5b03393697e126826fb6bbde8ecc
+ * PTX 1e08298516e2c9ed03ff3c1b7860f6de76d4cecd94c8119855ef5297ca67e9f3
+ * PTX e7ff72b1e99785ca0a7e7720c5b36dc6d72cac9574c8cbbc2f801e23e56fd344
+ * PTX b07f22154beba0f08ce8891e643ed995c94d9a69c9f1b5f499027a78572aeebd
+ * PTX 74d20cc39881c213ee770b1010e4bea718846977ae119f7a023ab58cca0ad752
+ * PTX afe656bb3c17256a9f6e9bf19fdd5a38fc82bbe872c5539edb609ef4f79c203e
+ * PTX bb140f2e583cb2ad15b4aa5b655016a8449277dbd477ef2c8d6c017db738b18d
+ * PTX eb4a427d1923ce3ff262735779a418f20a282df920147beabe421ee5319d0568
+ * CTX 264d3ca8512194fec312c8c9891f279fefdd608d0c027b60483a3fa811d65ee5
+ * CTX 9d52d9e40ec5672d81532b38b6b089ce951f0f9c35590b8b978d175213f329bb
+ * CTX 1c2fd30f2f7f30492a61a532a79f51d36f5e31a7c9a12c286082ff7d2394d18f
+ * CTX 783e1a8e72c722caaaa52d8f065657d2631fd25bfd8e5baad6e527d763517501
+ * CTX c68c5edc3cdd55435c532d7125c8614deed9adaa3acade5888b87bef641c4c99
+ * CTX 4c8091b5bcd387f3963fb5bc37aa922fbfe3df4e5b915e6eb514717bdd2a7407
+ * CTX 9a5073f5c4bfd46adf7d282e7a393a52579d11a028da4d9cd9c77124f9648ee3
+ * CTX 83b1ac763930e7162a8d37f350b2f74b8472cf09902063c6b32e8c2d9290cefb
+ * CTX d7346d1c779a0df50edcde4531da07b099c638e83a755944df2aef1aa31752fd
+ * CTX 323dcb710fb4bfbb9d22b925bc3577e1b8949e729a90bbafeacf7f7879e7b114
+ * CTX 7e28ba0bae940db795a61b15ecf4df8db07b824bb062802cc98a9545bb2aaeed
+ * CTX 77cb3fc6db15dcd7d80d7d5bc406c4970a3478ada8899b329198eb61c193fb62
+ * CTX 75aa8ca340344a75a862aebe92eee1ce032fd950b47d7704a3876923b4ad6284
+ * CTX 4bf4a09c4dbe8b4397184b7471360c9564880aedddb9baa4af2e75394b08cd32
+ * CTX ff479c57a07d3eab5d54de5f9738b8d27f27a9f0ab11799d7b7ffefb2704c95c
+ * CTX 6ad12c39f1e867a4b7b1d7818a4b753dfd2a89ccb45e001a03a867b187f225dd
+ * Plaintext length (bytes): 512
+ */
+
+static uint8_t v5_key1[16] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+static uint8_t v5_key2[16] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+static uint8_t v5_TW[16] = {
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v5_PTX[512] = {
+ 0x27, 0xa7, 0x47, 0x9b, 0xef, 0xa1, 0xd4, 0x76,
+ 0x48, 0x9f, 0x30, 0x8c, 0xd4, 0xcf, 0xa6, 0xe2,
+ 0xa9, 0x6e, 0x4b, 0xbe, 0x32, 0x08, 0xff, 0x25,
+ 0x28, 0x7d, 0xd3, 0x81, 0x96, 0x16, 0xe8, 0x9c,
+ 0xc7, 0x8c, 0xf7, 0xf5, 0xe5, 0x43, 0x44, 0x5f,
+ 0x83, 0x33, 0xd8, 0xfa, 0x7f, 0x56, 0x00, 0x00,
+ 0x05, 0x27, 0x9f, 0xa5, 0xd8, 0xb5, 0xe4, 0xad,
+ 0x40, 0xe7, 0x36, 0xdd, 0xb4, 0xd3, 0x54, 0x12,
+ 0x32, 0x80, 0x63, 0xfd, 0x2a, 0xab, 0x53, 0xe5,
+ 0xea, 0x1e, 0x0a, 0x9f, 0x33, 0x25, 0x00, 0xa5,
+ 0xdf, 0x94, 0x87, 0xd0, 0x7a, 0x5c, 0x92, 0xcc,
+ 0x51, 0x2c, 0x88, 0x66, 0xc7, 0xe8, 0x60, 0xce,
+ 0x93, 0xfd, 0xf1, 0x66, 0xa2, 0x49, 0x12, 0xb4,
+ 0x22, 0x97, 0x61, 0x46, 0xae, 0x20, 0xce, 0x84,
+ 0x6b, 0xb7, 0xdc, 0x9b, 0xa9, 0x4a, 0x76, 0x7a,
+ 0xae, 0xf2, 0x0c, 0x0d, 0x61, 0xad, 0x02, 0x65,
+ 0x5e, 0xa9, 0x2d, 0xc4, 0xc4, 0xe4, 0x1a, 0x89,
+ 0x52, 0xc6, 0x51, 0xd3, 0x31, 0x74, 0xbe, 0x51,
+ 0xa1, 0x0c, 0x42, 0x11, 0x10, 0xe6, 0xd8, 0x15,
+ 0x88, 0xed, 0xe8, 0x21, 0x03, 0xa2, 0x52, 0xd8,
+ 0xa7, 0x50, 0xe8, 0x76, 0x8d, 0xef, 0xff, 0xed,
+ 0x91, 0x22, 0x81, 0x0a, 0xae, 0xb9, 0x9f, 0x91,
+ 0x72, 0xaf, 0x82, 0xb6, 0x04, 0xdc, 0x4b, 0x8e,
+ 0x51, 0xbc, 0xb0, 0x82, 0x35, 0xa6, 0xf4, 0x34,
+ 0x13, 0x32, 0xe4, 0xca, 0x60, 0x48, 0x2a, 0x4b,
+ 0xa1, 0xa0, 0x3b, 0x3e, 0x65, 0x00, 0x8f, 0xc5,
+ 0xda, 0x76, 0xb7, 0x0b, 0xf1, 0x69, 0x0d, 0xb4,
+ 0xea, 0xe2, 0x9c, 0x5f, 0x1b, 0xad, 0xd0, 0x3c,
+ 0x5c, 0xcf, 0x2a, 0x55, 0xd7, 0x05, 0xdd, 0xcd,
+ 0x86, 0xd4, 0x49, 0x51, 0x1c, 0xeb, 0x7e, 0xc3,
+ 0x0b, 0xf1, 0x2b, 0x1f, 0xa3, 0x5b, 0x91, 0x3f,
+ 0x9f, 0x74, 0x7a, 0x8a, 0xfd, 0x1b, 0x13, 0x0e,
+ 0x94, 0xbf, 0xf9, 0x4e, 0xff, 0xd0, 0x1a, 0x91,
+ 0x73, 0x5c, 0xa1, 0x72, 0x6a, 0xcd, 0x0b, 0x19,
+ 0x7c, 0x4e, 0x5b, 0x03, 0x39, 0x36, 0x97, 0xe1,
+ 0x26, 0x82, 0x6f, 0xb6, 0xbb, 0xde, 0x8e, 0xcc,
+ 0x1e, 0x08, 0x29, 0x85, 0x16, 0xe2, 0xc9, 0xed,
+ 0x03, 0xff, 0x3c, 0x1b, 0x78, 0x60, 0xf6, 0xde,
+ 0x76, 0xd4, 0xce, 0xcd, 0x94, 0xc8, 0x11, 0x98,
+ 0x55, 0xef, 0x52, 0x97, 0xca, 0x67, 0xe9, 0xf3,
+ 0xe7, 0xff, 0x72, 0xb1, 0xe9, 0x97, 0x85, 0xca,
+ 0x0a, 0x7e, 0x77, 0x20, 0xc5, 0xb3, 0x6d, 0xc6,
+ 0xd7, 0x2c, 0xac, 0x95, 0x74, 0xc8, 0xcb, 0xbc,
+ 0x2f, 0x80, 0x1e, 0x23, 0xe5, 0x6f, 0xd3, 0x44,
+ 0xb0, 0x7f, 0x22, 0x15, 0x4b, 0xeb, 0xa0, 0xf0,
+ 0x8c, 0xe8, 0x89, 0x1e, 0x64, 0x3e, 0xd9, 0x95,
+ 0xc9, 0x4d, 0x9a, 0x69, 0xc9, 0xf1, 0xb5, 0xf4,
+ 0x99, 0x02, 0x7a, 0x78, 0x57, 0x2a, 0xee, 0xbd,
+ 0x74, 0xd2, 0x0c, 0xc3, 0x98, 0x81, 0xc2, 0x13,
+ 0xee, 0x77, 0x0b, 0x10, 0x10, 0xe4, 0xbe, 0xa7,
+ 0x18, 0x84, 0x69, 0x77, 0xae, 0x11, 0x9f, 0x7a,
+ 0x02, 0x3a, 0xb5, 0x8c, 0xca, 0x0a, 0xd7, 0x52,
+ 0xaf, 0xe6, 0x56, 0xbb, 0x3c, 0x17, 0x25, 0x6a,
+ 0x9f, 0x6e, 0x9b, 0xf1, 0x9f, 0xdd, 0x5a, 0x38,
+ 0xfc, 0x82, 0xbb, 0xe8, 0x72, 0xc5, 0x53, 0x9e,
+ 0xdb, 0x60, 0x9e, 0xf4, 0xf7, 0x9c, 0x20, 0x3e,
+ 0xbb, 0x14, 0x0f, 0x2e, 0x58, 0x3c, 0xb2, 0xad,
+ 0x15, 0xb4, 0xaa, 0x5b, 0x65, 0x50, 0x16, 0xa8,
+ 0x44, 0x92, 0x77, 0xdb, 0xd4, 0x77, 0xef, 0x2c,
+ 0x8d, 0x6c, 0x01, 0x7d, 0xb7, 0x38, 0xb1, 0x8d,
+ 0xeb, 0x4a, 0x42, 0x7d, 0x19, 0x23, 0xce, 0x3f,
+ 0xf2, 0x62, 0x73, 0x57, 0x79, 0xa4, 0x18, 0xf2,
+ 0x0a, 0x28, 0x2d, 0xf9, 0x20, 0x14, 0x7b, 0xea,
+ 0xbe, 0x42, 0x1e, 0xe5, 0x31, 0x9d, 0x05, 0x68
+};
+
+static uint8_t v5_CTX[512] = {
+ 0x26, 0x4d, 0x3c, 0xa8, 0x51, 0x21, 0x94, 0xfe,
+ 0xc3, 0x12, 0xc8, 0xc9, 0x89, 0x1f, 0x27, 0x9f,
+ 0xef, 0xdd, 0x60, 0x8d, 0x0c, 0x02, 0x7b, 0x60,
+ 0x48, 0x3a, 0x3f, 0xa8, 0x11, 0xd6, 0x5e, 0xe5,
+ 0x9d, 0x52, 0xd9, 0xe4, 0x0e, 0xc5, 0x67, 0x2d,
+ 0x81, 0x53, 0x2b, 0x38, 0xb6, 0xb0, 0x89, 0xce,
+ 0x95, 0x1f, 0x0f, 0x9c, 0x35, 0x59, 0x0b, 0x8b,
+ 0x97, 0x8d, 0x17, 0x52, 0x13, 0xf3, 0x29, 0xbb,
+ 0x1c, 0x2f, 0xd3, 0x0f, 0x2f, 0x7f, 0x30, 0x49,
+ 0x2a, 0x61, 0xa5, 0x32, 0xa7, 0x9f, 0x51, 0xd3,
+ 0x6f, 0x5e, 0x31, 0xa7, 0xc9, 0xa1, 0x2c, 0x28,
+ 0x60, 0x82, 0xff, 0x7d, 0x23, 0x94, 0xd1, 0x8f,
+ 0x78, 0x3e, 0x1a, 0x8e, 0x72, 0xc7, 0x22, 0xca,
+ 0xaa, 0xa5, 0x2d, 0x8f, 0x06, 0x56, 0x57, 0xd2,
+ 0x63, 0x1f, 0xd2, 0x5b, 0xfd, 0x8e, 0x5b, 0xaa,
+ 0xd6, 0xe5, 0x27, 0xd7, 0x63, 0x51, 0x75, 0x01,
+ 0xc6, 0x8c, 0x5e, 0xdc, 0x3c, 0xdd, 0x55, 0x43,
+ 0x5c, 0x53, 0x2d, 0x71, 0x25, 0xc8, 0x61, 0x4d,
+ 0xee, 0xd9, 0xad, 0xaa, 0x3a, 0xca, 0xde, 0x58,
+ 0x88, 0xb8, 0x7b, 0xef, 0x64, 0x1c, 0x4c, 0x99,
+ 0x4c, 0x80, 0x91, 0xb5, 0xbc, 0xd3, 0x87, 0xf3,
+ 0x96, 0x3f, 0xb5, 0xbc, 0x37, 0xaa, 0x92, 0x2f,
+ 0xbf, 0xe3, 0xdf, 0x4e, 0x5b, 0x91, 0x5e, 0x6e,
+ 0xb5, 0x14, 0x71, 0x7b, 0xdd, 0x2a, 0x74, 0x07,
+ 0x9a, 0x50, 0x73, 0xf5, 0xc4, 0xbf, 0xd4, 0x6a,
+ 0xdf, 0x7d, 0x28, 0x2e, 0x7a, 0x39, 0x3a, 0x52,
+ 0x57, 0x9d, 0x11, 0xa0, 0x28, 0xda, 0x4d, 0x9c,
+ 0xd9, 0xc7, 0x71, 0x24, 0xf9, 0x64, 0x8e, 0xe3,
+ 0x83, 0xb1, 0xac, 0x76, 0x39, 0x30, 0xe7, 0x16,
+ 0x2a, 0x8d, 0x37, 0xf3, 0x50, 0xb2, 0xf7, 0x4b,
+ 0x84, 0x72, 0xcf, 0x09, 0x90, 0x20, 0x63, 0xc6,
+ 0xb3, 0x2e, 0x8c, 0x2d, 0x92, 0x90, 0xce, 0xfb,
+ 0xd7, 0x34, 0x6d, 0x1c, 0x77, 0x9a, 0x0d, 0xf5,
+ 0x0e, 0xdc, 0xde, 0x45, 0x31, 0xda, 0x07, 0xb0,
+ 0x99, 0xc6, 0x38, 0xe8, 0x3a, 0x75, 0x59, 0x44,
+ 0xdf, 0x2a, 0xef, 0x1a, 0xa3, 0x17, 0x52, 0xfd,
+ 0x32, 0x3d, 0xcb, 0x71, 0x0f, 0xb4, 0xbf, 0xbb,
+ 0x9d, 0x22, 0xb9, 0x25, 0xbc, 0x35, 0x77, 0xe1,
+ 0xb8, 0x94, 0x9e, 0x72, 0x9a, 0x90, 0xbb, 0xaf,
+ 0xea, 0xcf, 0x7f, 0x78, 0x79, 0xe7, 0xb1, 0x14,
+ 0x7e, 0x28, 0xba, 0x0b, 0xae, 0x94, 0x0d, 0xb7,
+ 0x95, 0xa6, 0x1b, 0x15, 0xec, 0xf4, 0xdf, 0x8d,
+ 0xb0, 0x7b, 0x82, 0x4b, 0xb0, 0x62, 0x80, 0x2c,
+ 0xc9, 0x8a, 0x95, 0x45, 0xbb, 0x2a, 0xae, 0xed,
+ 0x77, 0xcb, 0x3f, 0xc6, 0xdb, 0x15, 0xdc, 0xd7,
+ 0xd8, 0x0d, 0x7d, 0x5b, 0xc4, 0x06, 0xc4, 0x97,
+ 0x0a, 0x34, 0x78, 0xad, 0xa8, 0x89, 0x9b, 0x32,
+ 0x91, 0x98, 0xeb, 0x61, 0xc1, 0x93, 0xfb, 0x62,
+ 0x75, 0xaa, 0x8c, 0xa3, 0x40, 0x34, 0x4a, 0x75,
+ 0xa8, 0x62, 0xae, 0xbe, 0x92, 0xee, 0xe1, 0xce,
+ 0x03, 0x2f, 0xd9, 0x50, 0xb4, 0x7d, 0x77, 0x04,
+ 0xa3, 0x87, 0x69, 0x23, 0xb4, 0xad, 0x62, 0x84,
+ 0x4b, 0xf4, 0xa0, 0x9c, 0x4d, 0xbe, 0x8b, 0x43,
+ 0x97, 0x18, 0x4b, 0x74, 0x71, 0x36, 0x0c, 0x95,
+ 0x64, 0x88, 0x0a, 0xed, 0xdd, 0xb9, 0xba, 0xa4,
+ 0xaf, 0x2e, 0x75, 0x39, 0x4b, 0x08, 0xcd, 0x32,
+ 0xff, 0x47, 0x9c, 0x57, 0xa0, 0x7d, 0x3e, 0xab,
+ 0x5d, 0x54, 0xde, 0x5f, 0x97, 0x38, 0xb8, 0xd2,
+ 0x7f, 0x27, 0xa9, 0xf0, 0xab, 0x11, 0x79, 0x9d,
+ 0x7b, 0x7f, 0xfe, 0xfb, 0x27, 0x04, 0xc9, 0x5c,
+ 0x6a, 0xd1, 0x2c, 0x39, 0xf1, 0xe8, 0x67, 0xa4,
+ 0xb7, 0xb1, 0xd7, 0x81, 0x8a, 0x4b, 0x75, 0x3d,
+ 0xfd, 0x2a, 0x89, 0xcc, 0xb4, 0x5e, 0x00, 0x1a,
+ 0x03, 0xa8, 0x67, 0xb1, 0x87, 0xf2, 0x25, 0xdd
+};
+
+/*
+ * Vector 6
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence Number 02
+ * PTX 264d3ca8512194fec312c8c9891f279fefdd608d0c027b60483a3fa811d65ee5
+ * PTX 9d52d9e40ec5672d81532b38b6b089ce951f0f9c35590b8b978d175213f329bb
+ * PTX 1c2fd30f2f7f30492a61a532a79f51d36f5e31a7c9a12c286082ff7d2394d18f
+ * PTX 783e1a8e72c722caaaa52d8f065657d2631fd25bfd8e5baad6e527d763517501
+ * PTX c68c5edc3cdd55435c532d7125c8614deed9adaa3acade5888b87bef641c4c99
+ * PTX 4c8091b5bcd387f3963fb5bc37aa922fbfe3df4e5b915e6eb514717bdd2a7407
+ * PTX 9a5073f5c4bfd46adf7d282e7a393a52579d11a028da4d9cd9c77124f9648ee3
+ * PTX 83b1ac763930e7162a8d37f350b2f74b8472cf09902063c6b32e8c2d9290cefb
+ * PTX d7346d1c779a0df50edcde4531da07b099c638e83a755944df2aef1aa31752fd
+ * PTX 323dcb710fb4bfbb9d22b925bc3577e1b8949e729a90bbafeacf7f7879e7b114
+ * PTX 7e28ba0bae940db795a61b15ecf4df8db07b824bb062802cc98a9545bb2aaeed
+ * PTX 77cb3fc6db15dcd7d80d7d5bc406c4970a3478ada8899b329198eb61c193fb62
+ * PTX 75aa8ca340344a75a862aebe92eee1ce032fd950b47d7704a3876923b4ad6284
+ * PTX 4bf4a09c4dbe8b4397184b7471360c9564880aedddb9baa4af2e75394b08cd32
+ * PTX ff479c57a07d3eab5d54de5f9738b8d27f27a9f0ab11799d7b7ffefb2704c95c
+ * PTX 6ad12c39f1e867a4b7b1d7818a4b753dfd2a89ccb45e001a03a867b187f225dd
+ * CTX fa762a3680b76007928ed4a4f49a9456031b704782e65e16cecb54ed7d017b5e
+ * CTX 18abd67b338e81078f21edb7868d901ebe9c731a7c18b5e6dec1d6a72e078ac9
+ * CTX a4262f860beefa14f4e821018272e411a951502b6e79066e84252c3346f3aa62
+ * CTX 344351a291d4bedc7a07618bdea2af63145cc7a4b8d4070691ae890cd65733e7
+ * CTX 946e9021a1dffc4c59f159425ee6d50ca9b135fa6162cea18a939838dc000fb3
+ * CTX 86fad086acce5ac07cb2ece7fd580b00cfa5e98589631dc25e8e2a3daf2ffdec
+ * CTX 26531659912c9d8f7a15e5865ea8fb5816d6207052bd7128cd743c12c8118791
+ * CTX a4736811935eb982a532349e31dd401e0b660a568cb1a4711f552f55ded59f1f
+ * CTX 15bf7196b3ca12a91e488ef59d64f3a02bf45239499ac6176ae321c4a211ec54
+ * CTX 5365971c5d3f4f09d4eb139bfdf2073d33180b21002b65cc9865e76cb24cd92c
+ * CTX 874c24c18350399a936ab3637079295d76c417776b94efce3a0ef7206b151105
+ * CTX 19655c956cbd8b2489405ee2b09a6b6eebe0c53790a12a8998378b33a5b71159
+ * CTX 625f4ba49d2a2fdba59fbf0897bc7aabd8d707dc140a80f0f309f835d3da54ab
+ * CTX 584e501dfa0ee977fec543f74186a802b9a37adb3e8291eca04d66520d229e60
+ * CTX 401e7282bef486ae059aa70696e0e305d777140a7a883ecdcb69b9ff938e8a42
+ * CTX 31864c69ca2c2043bed007ff3e605e014bcf518138dc3a25c5e236171a2d01d6
+ * Plaintext length (bytes): 512
+ */
+static uint8_t v6_key1[16] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+static uint8_t v6_key2[16] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+static uint8_t v6_TW[16] = {
+ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v6_PTX[512] = {
+
+ 0x26, 0x4d, 0x3c, 0xa8, 0x51, 0x21, 0x94, 0xfe,
+ 0xc3, 0x12, 0xc8, 0xc9, 0x89, 0x1f, 0x27, 0x9f,
+ 0xef, 0xdd, 0x60, 0x8d, 0x0c, 0x02, 0x7b, 0x60,
+ 0x48, 0x3a, 0x3f, 0xa8, 0x11, 0xd6, 0x5e, 0xe5,
+ 0x9d, 0x52, 0xd9, 0xe4, 0x0e, 0xc5, 0x67, 0x2d,
+ 0x81, 0x53, 0x2b, 0x38, 0xb6, 0xb0, 0x89, 0xce,
+ 0x95, 0x1f, 0x0f, 0x9c, 0x35, 0x59, 0x0b, 0x8b,
+ 0x97, 0x8d, 0x17, 0x52, 0x13, 0xf3, 0x29, 0xbb,
+ 0x1c, 0x2f, 0xd3, 0x0f, 0x2f, 0x7f, 0x30, 0x49,
+ 0x2a, 0x61, 0xa5, 0x32, 0xa7, 0x9f, 0x51, 0xd3,
+ 0x6f, 0x5e, 0x31, 0xa7, 0xc9, 0xa1, 0x2c, 0x28,
+ 0x60, 0x82, 0xff, 0x7d, 0x23, 0x94, 0xd1, 0x8f,
+ 0x78, 0x3e, 0x1a, 0x8e, 0x72, 0xc7, 0x22, 0xca,
+ 0xaa, 0xa5, 0x2d, 0x8f, 0x06, 0x56, 0x57, 0xd2,
+ 0x63, 0x1f, 0xd2, 0x5b, 0xfd, 0x8e, 0x5b, 0xaa,
+ 0xd6, 0xe5, 0x27, 0xd7, 0x63, 0x51, 0x75, 0x01,
+ 0xc6, 0x8c, 0x5e, 0xdc, 0x3c, 0xdd, 0x55, 0x43,
+ 0x5c, 0x53, 0x2d, 0x71, 0x25, 0xc8, 0x61, 0x4d,
+ 0xee, 0xd9, 0xad, 0xaa, 0x3a, 0xca, 0xde, 0x58,
+ 0x88, 0xb8, 0x7b, 0xef, 0x64, 0x1c, 0x4c, 0x99,
+ 0x4c, 0x80, 0x91, 0xb5, 0xbc, 0xd3, 0x87, 0xf3,
+ 0x96, 0x3f, 0xb5, 0xbc, 0x37, 0xaa, 0x92, 0x2f,
+ 0xbf, 0xe3, 0xdf, 0x4e, 0x5b, 0x91, 0x5e, 0x6e,
+ 0xb5, 0x14, 0x71, 0x7b, 0xdd, 0x2a, 0x74, 0x07,
+ 0x9a, 0x50, 0x73, 0xf5, 0xc4, 0xbf, 0xd4, 0x6a,
+ 0xdf, 0x7d, 0x28, 0x2e, 0x7a, 0x39, 0x3a, 0x52,
+ 0x57, 0x9d, 0x11, 0xa0, 0x28, 0xda, 0x4d, 0x9c,
+ 0xd9, 0xc7, 0x71, 0x24, 0xf9, 0x64, 0x8e, 0xe3,
+ 0x83, 0xb1, 0xac, 0x76, 0x39, 0x30, 0xe7, 0x16,
+ 0x2a, 0x8d, 0x37, 0xf3, 0x50, 0xb2, 0xf7, 0x4b,
+ 0x84, 0x72, 0xcf, 0x09, 0x90, 0x20, 0x63, 0xc6,
+ 0xb3, 0x2e, 0x8c, 0x2d, 0x92, 0x90, 0xce, 0xfb,
+ 0xd7, 0x34, 0x6d, 0x1c, 0x77, 0x9a, 0x0d, 0xf5,
+ 0x0e, 0xdc, 0xde, 0x45, 0x31, 0xda, 0x07, 0xb0,
+ 0x99, 0xc6, 0x38, 0xe8, 0x3a, 0x75, 0x59, 0x44,
+ 0xdf, 0x2a, 0xef, 0x1a, 0xa3, 0x17, 0x52, 0xfd,
+ 0x32, 0x3d, 0xcb, 0x71, 0x0f, 0xb4, 0xbf, 0xbb,
+ 0x9d, 0x22, 0xb9, 0x25, 0xbc, 0x35, 0x77, 0xe1,
+ 0xb8, 0x94, 0x9e, 0x72, 0x9a, 0x90, 0xbb, 0xaf,
+ 0xea, 0xcf, 0x7f, 0x78, 0x79, 0xe7, 0xb1, 0x14,
+ 0x7e, 0x28, 0xba, 0x0b, 0xae, 0x94, 0x0d, 0xb7,
+ 0x95, 0xa6, 0x1b, 0x15, 0xec, 0xf4, 0xdf, 0x8d,
+ 0xb0, 0x7b, 0x82, 0x4b, 0xb0, 0x62, 0x80, 0x2c,
+ 0xc9, 0x8a, 0x95, 0x45, 0xbb, 0x2a, 0xae, 0xed,
+ 0x77, 0xcb, 0x3f, 0xc6, 0xdb, 0x15, 0xdc, 0xd7,
+ 0xd8, 0x0d, 0x7d, 0x5b, 0xc4, 0x06, 0xc4, 0x97,
+ 0x0a, 0x34, 0x78, 0xad, 0xa8, 0x89, 0x9b, 0x32,
+ 0x91, 0x98, 0xeb, 0x61, 0xc1, 0x93, 0xfb, 0x62,
+ 0x75, 0xaa, 0x8c, 0xa3, 0x40, 0x34, 0x4a, 0x75,
+ 0xa8, 0x62, 0xae, 0xbe, 0x92, 0xee, 0xe1, 0xce,
+ 0x03, 0x2f, 0xd9, 0x50, 0xb4, 0x7d, 0x77, 0x04,
+ 0xa3, 0x87, 0x69, 0x23, 0xb4, 0xad, 0x62, 0x84,
+ 0x4b, 0xf4, 0xa0, 0x9c, 0x4d, 0xbe, 0x8b, 0x43,
+ 0x97, 0x18, 0x4b, 0x74, 0x71, 0x36, 0x0c, 0x95,
+ 0x64, 0x88, 0x0a, 0xed, 0xdd, 0xb9, 0xba, 0xa4,
+ 0xaf, 0x2e, 0x75, 0x39, 0x4b, 0x08, 0xcd, 0x32,
+ 0xff, 0x47, 0x9c, 0x57, 0xa0, 0x7d, 0x3e, 0xab,
+ 0x5d, 0x54, 0xde, 0x5f, 0x97, 0x38, 0xb8, 0xd2,
+ 0x7f, 0x27, 0xa9, 0xf0, 0xab, 0x11, 0x79, 0x9d,
+ 0x7b, 0x7f, 0xfe, 0xfb, 0x27, 0x04, 0xc9, 0x5c,
+ 0x6a, 0xd1, 0x2c, 0x39, 0xf1, 0xe8, 0x67, 0xa4,
+ 0xb7, 0xb1, 0xd7, 0x81, 0x8a, 0x4b, 0x75, 0x3d,
+ 0xfd, 0x2a, 0x89, 0xcc, 0xb4, 0x5e, 0x00, 0x1a,
+ 0x03, 0xa8, 0x67, 0xb1, 0x87, 0xf2, 0x25, 0xdd
+};
+
+static uint8_t v6_CTX[512] = {
+
+ 0xfa, 0x76, 0x2a, 0x36, 0x80, 0xb7, 0x60, 0x07,
+ 0x92, 0x8e, 0xd4, 0xa4, 0xf4, 0x9a, 0x94, 0x56,
+ 0x03, 0x1b, 0x70, 0x47, 0x82, 0xe6, 0x5e, 0x16,
+ 0xce, 0xcb, 0x54, 0xed, 0x7d, 0x01, 0x7b, 0x5e,
+ 0x18, 0xab, 0xd6, 0x7b, 0x33, 0x8e, 0x81, 0x07,
+ 0x8f, 0x21, 0xed, 0xb7, 0x86, 0x8d, 0x90, 0x1e,
+ 0xbe, 0x9c, 0x73, 0x1a, 0x7c, 0x18, 0xb5, 0xe6,
+ 0xde, 0xc1, 0xd6, 0xa7, 0x2e, 0x07, 0x8a, 0xc9,
+ 0xa4, 0x26, 0x2f, 0x86, 0x0b, 0xee, 0xfa, 0x14,
+ 0xf4, 0xe8, 0x21, 0x01, 0x82, 0x72, 0xe4, 0x11,
+ 0xa9, 0x51, 0x50, 0x2b, 0x6e, 0x79, 0x06, 0x6e,
+ 0x84, 0x25, 0x2c, 0x33, 0x46, 0xf3, 0xaa, 0x62,
+ 0x34, 0x43, 0x51, 0xa2, 0x91, 0xd4, 0xbe, 0xdc,
+ 0x7a, 0x07, 0x61, 0x8b, 0xde, 0xa2, 0xaf, 0x63,
+ 0x14, 0x5c, 0xc7, 0xa4, 0xb8, 0xd4, 0x07, 0x06,
+ 0x91, 0xae, 0x89, 0x0c, 0xd6, 0x57, 0x33, 0xe7,
+ 0x94, 0x6e, 0x90, 0x21, 0xa1, 0xdf, 0xfc, 0x4c,
+ 0x59, 0xf1, 0x59, 0x42, 0x5e, 0xe6, 0xd5, 0x0c,
+ 0xa9, 0xb1, 0x35, 0xfa, 0x61, 0x62, 0xce, 0xa1,
+ 0x8a, 0x93, 0x98, 0x38, 0xdc, 0x00, 0x0f, 0xb3,
+ 0x86, 0xfa, 0xd0, 0x86, 0xac, 0xce, 0x5a, 0xc0,
+ 0x7c, 0xb2, 0xec, 0xe7, 0xfd, 0x58, 0x0b, 0x00,
+ 0xcf, 0xa5, 0xe9, 0x85, 0x89, 0x63, 0x1d, 0xc2,
+ 0x5e, 0x8e, 0x2a, 0x3d, 0xaf, 0x2f, 0xfd, 0xec,
+ 0x26, 0x53, 0x16, 0x59, 0x91, 0x2c, 0x9d, 0x8f,
+ 0x7a, 0x15, 0xe5, 0x86, 0x5e, 0xa8, 0xfb, 0x58,
+ 0x16, 0xd6, 0x20, 0x70, 0x52, 0xbd, 0x71, 0x28,
+ 0xcd, 0x74, 0x3c, 0x12, 0xc8, 0x11, 0x87, 0x91,
+ 0xa4, 0x73, 0x68, 0x11, 0x93, 0x5e, 0xb9, 0x82,
+ 0xa5, 0x32, 0x34, 0x9e, 0x31, 0xdd, 0x40, 0x1e,
+ 0x0b, 0x66, 0x0a, 0x56, 0x8c, 0xb1, 0xa4, 0x71,
+ 0x1f, 0x55, 0x2f, 0x55, 0xde, 0xd5, 0x9f, 0x1f,
+ 0x15, 0xbf, 0x71, 0x96, 0xb3, 0xca, 0x12, 0xa9,
+ 0x1e, 0x48, 0x8e, 0xf5, 0x9d, 0x64, 0xf3, 0xa0,
+ 0x2b, 0xf4, 0x52, 0x39, 0x49, 0x9a, 0xc6, 0x17,
+ 0x6a, 0xe3, 0x21, 0xc4, 0xa2, 0x11, 0xec, 0x54,
+ 0x53, 0x65, 0x97, 0x1c, 0x5d, 0x3f, 0x4f, 0x09,
+ 0xd4, 0xeb, 0x13, 0x9b, 0xfd, 0xf2, 0x07, 0x3d,
+ 0x33, 0x18, 0x0b, 0x21, 0x00, 0x2b, 0x65, 0xcc,
+ 0x98, 0x65, 0xe7, 0x6c, 0xb2, 0x4c, 0xd9, 0x2c,
+ 0x87, 0x4c, 0x24, 0xc1, 0x83, 0x50, 0x39, 0x9a,
+ 0x93, 0x6a, 0xb3, 0x63, 0x70, 0x79, 0x29, 0x5d,
+ 0x76, 0xc4, 0x17, 0x77, 0x6b, 0x94, 0xef, 0xce,
+ 0x3a, 0x0e, 0xf7, 0x20, 0x6b, 0x15, 0x11, 0x05,
+ 0x19, 0x65, 0x5c, 0x95, 0x6c, 0xbd, 0x8b, 0x24,
+ 0x89, 0x40, 0x5e, 0xe2, 0xb0, 0x9a, 0x6b, 0x6e,
+ 0xeb, 0xe0, 0xc5, 0x37, 0x90, 0xa1, 0x2a, 0x89,
+ 0x98, 0x37, 0x8b, 0x33, 0xa5, 0xb7, 0x11, 0x59,
+ 0x62, 0x5f, 0x4b, 0xa4, 0x9d, 0x2a, 0x2f, 0xdb,
+ 0xa5, 0x9f, 0xbf, 0x08, 0x97, 0xbc, 0x7a, 0xab,
+ 0xd8, 0xd7, 0x07, 0xdc, 0x14, 0x0a, 0x80, 0xf0,
+ 0xf3, 0x09, 0xf8, 0x35, 0xd3, 0xda, 0x54, 0xab,
+ 0x58, 0x4e, 0x50, 0x1d, 0xfa, 0x0e, 0xe9, 0x77,
+ 0xfe, 0xc5, 0x43, 0xf7, 0x41, 0x86, 0xa8, 0x02,
+ 0xb9, 0xa3, 0x7a, 0xdb, 0x3e, 0x82, 0x91, 0xec,
+ 0xa0, 0x4d, 0x66, 0x52, 0x0d, 0x22, 0x9e, 0x60,
+ 0x40, 0x1e, 0x72, 0x82, 0xbe, 0xf4, 0x86, 0xae,
+ 0x05, 0x9a, 0xa7, 0x06, 0x96, 0xe0, 0xe3, 0x05,
+ 0xd7, 0x77, 0x14, 0x0a, 0x7a, 0x88, 0x3e, 0xcd,
+ 0xcb, 0x69, 0xb9, 0xff, 0x93, 0x8e, 0x8a, 0x42,
+ 0x31, 0x86, 0x4c, 0x69, 0xca, 0x2c, 0x20, 0x43,
+ 0xbe, 0xd0, 0x07, 0xff, 0x3e, 0x60, 0x5e, 0x01,
+ 0x4b, 0xcf, 0x51, 0x81, 0x38, 0xdc, 0x3a, 0x25,
+ 0xc5, 0xe2, 0x36, 0x17, 0x1a, 0x2d, 0x01, 0xd6
+};
+
+/*
+ * Vector 7
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence Number fd
+ * PTX 8e41b78c390b5af9d758bb214a67e9f6bf7727b09ac6124084c37611398fa45d
+ * PTX aad94868600ed391fb1acd4857a95b466e62ef9f4b377244d1c152e7b30d731a
+ * PTX ad30c716d214b707aed99eb5b5e580b3e887cf7497465651d4b60e6042051da3
+ * PTX 693c3b78c14489543be8b6ad0ba629565bba202313ba7b0d0c94a3252b676f46
+ * PTX cc02ce0f8a7d34c0ed229129673c1f61aed579d08a9203a25aac3a77e9db6026
+ * PTX 7996db38df637356d9dcd1632e369939f2a29d89345c66e05066f1a3677aef18
+ * PTX dea4113faeb629e46721a66d0a7e785d3e29af2594eb67dfa982affe0aac058f
+ * PTX 6e15864269b135418261fc3afb089472cf68c45dd7f231c6249ba0255e1e0338
+ * PTX 33fc4d00a3fe02132d7bc3873614b8aee34273581ea0325c81f0270affa13641
+ * PTX d052d36f0757d484014354d02d6883ca15c24d8c3956b1bd027bcf41f151fd80
+ * PTX 23c5340e5606f37e90fdb87c86fb4fa634b3718a30bace06a66eaf8f63c4aa3b
+ * PTX 637826a87fe8cfa44282e92cb1615af3a28e53bc74c7cba1a0977be9065d0c1a
+ * PTX 5dec6c54ae38d37f37aa35283e048e5530a85c4e7a29d7b92ec0c3169cdf2a80
+ * PTX 5c7604bce60049b9fb7b8eaac10f51ae23794ceba68bb58112e293b9b692ca72
+ * PTX 1b37c662f8574ed4dba6f88e170881c82cddc1034a0ca7e284bf0962b6b26292
+ * PTX d836fa9f73c1ac770eef0f2d3a1eaf61d3e03555fd424eedd67e18a18094f888
+ * CTX d55f684f81f4426e9fde92a5ff02df2ac896af63962888a97910c1379e20b0a3
+ * CTX b1db613fb7fe2e07004329ea5c22bfd33e3dbe4cf58cc608c2c26c19a2e2fe22
+ * CTX f98732c2b5cb844cc6c0702d91e1d50fc4382a7eba5635cd602432a2306ac4ce
+ * CTX 82f8d70c8d9bc15f918fe71e74c622d5cf71178bf6e0b9cc9f2b41dd8dbe441c
+ * CTX 41cd0c73a6dc47a348f6702f9d0e9b1b1431e948e299b9ec2272ab2c5f0c7be8
+ * CTX 6affa5dec87a0bee81d3d50007edaa2bcfccb35605155ff36ed8edd4a40dcd4b
+ * CTX 243acd11b2b987bdbfaf91a7cac27e9c5aea525ee53de7b2d3332c8644402b82
+ * CTX 3e94a7db26276d2d23aa07180f76b4fd29b9c0823099c9d62c519880aee7e969
+ * CTX 7617c1497d47bf3e571950311421b6b734d38b0db91eb85331b91ea9f61530f5
+ * CTX 4512a5a52a4bad589eb69781d537f23297bb459bdad2948a29e1550bf4787e0b
+ * CTX e95bb173cf5fab17dab7a13a052a63453d97ccec1a321954886b7a1299faaeec
+ * CTX ae35c6eaaca753b041b5e5f093bf83397fd21dd6b3012066fcc058cc32c3b09d
+ * CTX 7562dee29509b5839392c9ff05f51f3166aaac4ac5f238038a3045e6f72e48ef
+ * CTX 0fe8bc675e82c318a268e43970271bf119b81bf6a982746554f84e72b9f00280
+ * CTX a320a08142923c23c883423ff949827f29bbacdc1ccdb04938ce6098c95ba6b3
+ * CTX 2528f4ef78eed778b2e122ddfd1cbdd11d1c0a6783e011fc536d63d053260637
+ * Plaintext length (bytes): 512
+ */
+static uint8_t v7_key1[16] = {
+
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+static uint8_t v7_key2[16] = {
+
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+static uint8_t v7_TW[16] = {
+
+ 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v7_PTX[512] = {
+
+ 0x8e, 0x41, 0xb7, 0x8c, 0x39, 0x0b, 0x5a, 0xf9,
+ 0xd7, 0x58, 0xbb, 0x21, 0x4a, 0x67, 0xe9, 0xf6,
+ 0xbf, 0x77, 0x27, 0xb0, 0x9a, 0xc6, 0x12, 0x40,
+ 0x84, 0xc3, 0x76, 0x11, 0x39, 0x8f, 0xa4, 0x5d,
+ 0xaa, 0xd9, 0x48, 0x68, 0x60, 0x0e, 0xd3, 0x91,
+ 0xfb, 0x1a, 0xcd, 0x48, 0x57, 0xa9, 0x5b, 0x46,
+ 0x6e, 0x62, 0xef, 0x9f, 0x4b, 0x37, 0x72, 0x44,
+ 0xd1, 0xc1, 0x52, 0xe7, 0xb3, 0x0d, 0x73, 0x1a,
+ 0xad, 0x30, 0xc7, 0x16, 0xd2, 0x14, 0xb7, 0x07,
+ 0xae, 0xd9, 0x9e, 0xb5, 0xb5, 0xe5, 0x80, 0xb3,
+ 0xe8, 0x87, 0xcf, 0x74, 0x97, 0x46, 0x56, 0x51,
+ 0xd4, 0xb6, 0x0e, 0x60, 0x42, 0x05, 0x1d, 0xa3,
+ 0x69, 0x3c, 0x3b, 0x78, 0xc1, 0x44, 0x89, 0x54,
+ 0x3b, 0xe8, 0xb6, 0xad, 0x0b, 0xa6, 0x29, 0x56,
+ 0x5b, 0xba, 0x20, 0x23, 0x13, 0xba, 0x7b, 0x0d,
+ 0x0c, 0x94, 0xa3, 0x25, 0x2b, 0x67, 0x6f, 0x46,
+ 0xcc, 0x02, 0xce, 0x0f, 0x8a, 0x7d, 0x34, 0xc0,
+ 0xed, 0x22, 0x91, 0x29, 0x67, 0x3c, 0x1f, 0x61,
+ 0xae, 0xd5, 0x79, 0xd0, 0x8a, 0x92, 0x03, 0xa2,
+ 0x5a, 0xac, 0x3a, 0x77, 0xe9, 0xdb, 0x60, 0x26,
+ 0x79, 0x96, 0xdb, 0x38, 0xdf, 0x63, 0x73, 0x56,
+ 0xd9, 0xdc, 0xd1, 0x63, 0x2e, 0x36, 0x99, 0x39,
+ 0xf2, 0xa2, 0x9d, 0x89, 0x34, 0x5c, 0x66, 0xe0,
+ 0x50, 0x66, 0xf1, 0xa3, 0x67, 0x7a, 0xef, 0x18,
+ 0xde, 0xa4, 0x11, 0x3f, 0xae, 0xb6, 0x29, 0xe4,
+ 0x67, 0x21, 0xa6, 0x6d, 0x0a, 0x7e, 0x78, 0x5d,
+ 0x3e, 0x29, 0xaf, 0x25, 0x94, 0xeb, 0x67, 0xdf,
+ 0xa9, 0x82, 0xaf, 0xfe, 0x0a, 0xac, 0x05, 0x8f,
+ 0x6e, 0x15, 0x86, 0x42, 0x69, 0xb1, 0x35, 0x41,
+ 0x82, 0x61, 0xfc, 0x3a, 0xfb, 0x08, 0x94, 0x72,
+ 0xcf, 0x68, 0xc4, 0x5d, 0xd7, 0xf2, 0x31, 0xc6,
+ 0x24, 0x9b, 0xa0, 0x25, 0x5e, 0x1e, 0x03, 0x38,
+ 0x33, 0xfc, 0x4d, 0x00, 0xa3, 0xfe, 0x02, 0x13,
+ 0x2d, 0x7b, 0xc3, 0x87, 0x36, 0x14, 0xb8, 0xae,
+ 0xe3, 0x42, 0x73, 0x58, 0x1e, 0xa0, 0x32, 0x5c,
+ 0x81, 0xf0, 0x27, 0x0a, 0xff, 0xa1, 0x36, 0x41,
+ 0xd0, 0x52, 0xd3, 0x6f, 0x07, 0x57, 0xd4, 0x84,
+ 0x01, 0x43, 0x54, 0xd0, 0x2d, 0x68, 0x83, 0xca,
+ 0x15, 0xc2, 0x4d, 0x8c, 0x39, 0x56, 0xb1, 0xbd,
+ 0x02, 0x7b, 0xcf, 0x41, 0xf1, 0x51, 0xfd, 0x80,
+ 0x23, 0xc5, 0x34, 0x0e, 0x56, 0x06, 0xf3, 0x7e,
+ 0x90, 0xfd, 0xb8, 0x7c, 0x86, 0xfb, 0x4f, 0xa6,
+ 0x34, 0xb3, 0x71, 0x8a, 0x30, 0xba, 0xce, 0x06,
+ 0xa6, 0x6e, 0xaf, 0x8f, 0x63, 0xc4, 0xaa, 0x3b,
+ 0x63, 0x78, 0x26, 0xa8, 0x7f, 0xe8, 0xcf, 0xa4,
+ 0x42, 0x82, 0xe9, 0x2c, 0xb1, 0x61, 0x5a, 0xf3,
+ 0xa2, 0x8e, 0x53, 0xbc, 0x74, 0xc7, 0xcb, 0xa1,
+ 0xa0, 0x97, 0x7b, 0xe9, 0x06, 0x5d, 0x0c, 0x1a,
+ 0x5d, 0xec, 0x6c, 0x54, 0xae, 0x38, 0xd3, 0x7f,
+ 0x37, 0xaa, 0x35, 0x28, 0x3e, 0x04, 0x8e, 0x55,
+ 0x30, 0xa8, 0x5c, 0x4e, 0x7a, 0x29, 0xd7, 0xb9,
+ 0x2e, 0xc0, 0xc3, 0x16, 0x9c, 0xdf, 0x2a, 0x80,
+ 0x5c, 0x76, 0x04, 0xbc, 0xe6, 0x00, 0x49, 0xb9,
+ 0xfb, 0x7b, 0x8e, 0xaa, 0xc1, 0x0f, 0x51, 0xae,
+ 0x23, 0x79, 0x4c, 0xeb, 0xa6, 0x8b, 0xb5, 0x81,
+ 0x12, 0xe2, 0x93, 0xb9, 0xb6, 0x92, 0xca, 0x72,
+ 0x1b, 0x37, 0xc6, 0x62, 0xf8, 0x57, 0x4e, 0xd4,
+ 0xdb, 0xa6, 0xf8, 0x8e, 0x17, 0x08, 0x81, 0xc8,
+ 0x2c, 0xdd, 0xc1, 0x03, 0x4a, 0x0c, 0xa7, 0xe2,
+ 0x84, 0xbf, 0x09, 0x62, 0xb6, 0xb2, 0x62, 0x92,
+ 0xd8, 0x36, 0xfa, 0x9f, 0x73, 0xc1, 0xac, 0x77,
+ 0x0e, 0xef, 0x0f, 0x2d, 0x3a, 0x1e, 0xaf, 0x61,
+ 0xd3, 0xe0, 0x35, 0x55, 0xfd, 0x42, 0x4e, 0xed,
+ 0xd6, 0x7e, 0x18, 0xa1, 0x80, 0x94, 0xf8, 0x88
+};
+
+static uint8_t v7_CTX[512] = {
+
+ 0xd5, 0x5f, 0x68, 0x4f, 0x81, 0xf4, 0x42, 0x6e,
+ 0x9f, 0xde, 0x92, 0xa5, 0xff, 0x02, 0xdf, 0x2a,
+ 0xc8, 0x96, 0xaf, 0x63, 0x96, 0x28, 0x88, 0xa9,
+ 0x79, 0x10, 0xc1, 0x37, 0x9e, 0x20, 0xb0, 0xa3,
+ 0xb1, 0xdb, 0x61, 0x3f, 0xb7, 0xfe, 0x2e, 0x07,
+ 0x00, 0x43, 0x29, 0xea, 0x5c, 0x22, 0xbf, 0xd3,
+ 0x3e, 0x3d, 0xbe, 0x4c, 0xf5, 0x8c, 0xc6, 0x08,
+ 0xc2, 0xc2, 0x6c, 0x19, 0xa2, 0xe2, 0xfe, 0x22,
+ 0xf9, 0x87, 0x32, 0xc2, 0xb5, 0xcb, 0x84, 0x4c,
+ 0xc6, 0xc0, 0x70, 0x2d, 0x91, 0xe1, 0xd5, 0x0f,
+ 0xc4, 0x38, 0x2a, 0x7e, 0xba, 0x56, 0x35, 0xcd,
+ 0x60, 0x24, 0x32, 0xa2, 0x30, 0x6a, 0xc4, 0xce,
+ 0x82, 0xf8, 0xd7, 0x0c, 0x8d, 0x9b, 0xc1, 0x5f,
+ 0x91, 0x8f, 0xe7, 0x1e, 0x74, 0xc6, 0x22, 0xd5,
+ 0xcf, 0x71, 0x17, 0x8b, 0xf6, 0xe0, 0xb9, 0xcc,
+ 0x9f, 0x2b, 0x41, 0xdd, 0x8d, 0xbe, 0x44, 0x1c,
+ 0x41, 0xcd, 0x0c, 0x73, 0xa6, 0xdc, 0x47, 0xa3,
+ 0x48, 0xf6, 0x70, 0x2f, 0x9d, 0x0e, 0x9b, 0x1b,
+ 0x14, 0x31, 0xe9, 0x48, 0xe2, 0x99, 0xb9, 0xec,
+ 0x22, 0x72, 0xab, 0x2c, 0x5f, 0x0c, 0x7b, 0xe8,
+ 0x6a, 0xff, 0xa5, 0xde, 0xc8, 0x7a, 0x0b, 0xee,
+ 0x81, 0xd3, 0xd5, 0x00, 0x07, 0xed, 0xaa, 0x2b,
+ 0xcf, 0xcc, 0xb3, 0x56, 0x05, 0x15, 0x5f, 0xf3,
+ 0x6e, 0xd8, 0xed, 0xd4, 0xa4, 0x0d, 0xcd, 0x4b,
+ 0x24, 0x3a, 0xcd, 0x11, 0xb2, 0xb9, 0x87, 0xbd,
+ 0xbf, 0xaf, 0x91, 0xa7, 0xca, 0xc2, 0x7e, 0x9c,
+ 0x5a, 0xea, 0x52, 0x5e, 0xe5, 0x3d, 0xe7, 0xb2,
+ 0xd3, 0x33, 0x2c, 0x86, 0x44, 0x40, 0x2b, 0x82,
+ 0x3e, 0x94, 0xa7, 0xdb, 0x26, 0x27, 0x6d, 0x2d,
+ 0x23, 0xaa, 0x07, 0x18, 0x0f, 0x76, 0xb4, 0xfd,
+ 0x29, 0xb9, 0xc0, 0x82, 0x30, 0x99, 0xc9, 0xd6,
+ 0x2c, 0x51, 0x98, 0x80, 0xae, 0xe7, 0xe9, 0x69,
+ 0x76, 0x17, 0xc1, 0x49, 0x7d, 0x47, 0xbf, 0x3e,
+ 0x57, 0x19, 0x50, 0x31, 0x14, 0x21, 0xb6, 0xb7,
+ 0x34, 0xd3, 0x8b, 0x0d, 0xb9, 0x1e, 0xb8, 0x53,
+ 0x31, 0xb9, 0x1e, 0xa9, 0xf6, 0x15, 0x30, 0xf5,
+ 0x45, 0x12, 0xa5, 0xa5, 0x2a, 0x4b, 0xad, 0x58,
+ 0x9e, 0xb6, 0x97, 0x81, 0xd5, 0x37, 0xf2, 0x32,
+ 0x97, 0xbb, 0x45, 0x9b, 0xda, 0xd2, 0x94, 0x8a,
+ 0x29, 0xe1, 0x55, 0x0b, 0xf4, 0x78, 0x7e, 0x0b,
+ 0xe9, 0x5b, 0xb1, 0x73, 0xcf, 0x5f, 0xab, 0x17,
+ 0xda, 0xb7, 0xa1, 0x3a, 0x05, 0x2a, 0x63, 0x45,
+ 0x3d, 0x97, 0xcc, 0xec, 0x1a, 0x32, 0x19, 0x54,
+ 0x88, 0x6b, 0x7a, 0x12, 0x99, 0xfa, 0xae, 0xec,
+ 0xae, 0x35, 0xc6, 0xea, 0xac, 0xa7, 0x53, 0xb0,
+ 0x41, 0xb5, 0xe5, 0xf0, 0x93, 0xbf, 0x83, 0x39,
+ 0x7f, 0xd2, 0x1d, 0xd6, 0xb3, 0x01, 0x20, 0x66,
+ 0xfc, 0xc0, 0x58, 0xcc, 0x32, 0xc3, 0xb0, 0x9d,
+ 0x75, 0x62, 0xde, 0xe2, 0x95, 0x09, 0xb5, 0x83,
+ 0x93, 0x92, 0xc9, 0xff, 0x05, 0xf5, 0x1f, 0x31,
+ 0x66, 0xaa, 0xac, 0x4a, 0xc5, 0xf2, 0x38, 0x03,
+ 0x8a, 0x30, 0x45, 0xe6, 0xf7, 0x2e, 0x48, 0xef,
+ 0x0f, 0xe8, 0xbc, 0x67, 0x5e, 0x82, 0xc3, 0x18,
+ 0xa2, 0x68, 0xe4, 0x39, 0x70, 0x27, 0x1b, 0xf1,
+ 0x19, 0xb8, 0x1b, 0xf6, 0xa9, 0x82, 0x74, 0x65,
+ 0x54, 0xf8, 0x4e, 0x72, 0xb9, 0xf0, 0x02, 0x80,
+ 0xa3, 0x20, 0xa0, 0x81, 0x42, 0x92, 0x3c, 0x23,
+ 0xc8, 0x83, 0x42, 0x3f, 0xf9, 0x49, 0x82, 0x7f,
+ 0x29, 0xbb, 0xac, 0xdc, 0x1c, 0xcd, 0xb0, 0x49,
+ 0x38, 0xce, 0x60, 0x98, 0xc9, 0x5b, 0xa6, 0xb3,
+ 0x25, 0x28, 0xf4, 0xef, 0x78, 0xee, 0xd7, 0x78,
+ 0xb2, 0xe1, 0x22, 0xdd, 0xfd, 0x1c, 0xbd, 0xd1,
+ 0x1d, 0x1c, 0x0a, 0x67, 0x83, 0xe0, 0x11, 0xfc,
+ 0x53, 0x6d, 0x63, 0xd0, 0x53, 0x26, 0x06, 0x37
+};
+
+/*
+ * Vector 8
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence Number fe
+ * PTX d55f684f81f4426e9fde92a5ff02df2ac896af63962888a97910c1379e20b0a3
+ * PTX b1db613fb7fe2e07004329ea5c22bfd33e3dbe4cf58cc608c2c26c19a2e2fe22
+ * PTX f98732c2b5cb844cc6c0702d91e1d50fc4382a7eba5635cd602432a2306ac4ce
+ * PTX 82f8d70c8d9bc15f918fe71e74c622d5cf71178bf6e0b9cc9f2b41dd8dbe441c
+ * PTX 41cd0c73a6dc47a348f6702f9d0e9b1b1431e948e299b9ec2272ab2c5f0c7be8
+ * PTX 6affa5dec87a0bee81d3d50007edaa2bcfccb35605155ff36ed8edd4a40dcd4b
+ * PTX 243acd11b2b987bdbfaf91a7cac27e9c5aea525ee53de7b2d3332c8644402b82
+ * PTX 3e94a7db26276d2d23aa07180f76b4fd29b9c0823099c9d62c519880aee7e969
+ * PTX 7617c1497d47bf3e571950311421b6b734d38b0db91eb85331b91ea9f61530f5
+ * PTX 4512a5a52a4bad589eb69781d537f23297bb459bdad2948a29e1550bf4787e0b
+ * PTX e95bb173cf5fab17dab7a13a052a63453d97ccec1a321954886b7a1299faaeec
+ * PTX ae35c6eaaca753b041b5e5f093bf83397fd21dd6b3012066fcc058cc32c3b09d
+ * PTX 7562dee29509b5839392c9ff05f51f3166aaac4ac5f238038a3045e6f72e48ef
+ * PTX 0fe8bc675e82c318a268e43970271bf119b81bf6a982746554f84e72b9f00280
+ * PTX a320a08142923c23c883423ff949827f29bbacdc1ccdb04938ce6098c95ba6b3
+ * PTX 2528f4ef78eed778b2e122ddfd1cbdd11d1c0a6783e011fc536d63d053260637
+ * CTX 72efc1ebfe1ee25975a6eb3aa8589dda2b261f1c85bdab442a9e5b2dd1d7c395
+ * CTX 7a16fc08e526d4b1223f1b1232a11af274c3d70dac57f83e0983c498f1a6f1ae
+ * CTX cb021c3e70085a1e527f1ce41ee5911a82020161529cd82773762daf5459de94
+ * CTX a0a82adae7e1703c808543c29ed6fb32d9e004327c1355180c995a07741493a0
+ * CTX 9c21ba01a387882da4f62534b87bb15d60d197201c0fd3bf30c1500a3ecfecdd
+ * CTX 66d8721f90bcc4c17ee925c61b0a03727a9c0d5f5ca462fbfa0af1c2513a9d9d
+ * CTX 4b5345bd27a5f6e653f751693e6b6a2b8ead57d511e00e58c45b7b8d005af792
+ * CTX 88f5c7c22fd4f1bf7a898b03a5634c6a1ae3f9fae5de4f296a2896b23e7ed43e
+ * CTX d14fa5a2803f4d28f0d3ffcf24757677aebdb47bb388378708948a8d4126ed18
+ * CTX 39e0da29a537a8c198b3c66ab00712dd261674bf45a73d67f76914f830ca014b
+ * CTX 65596f27e4cf62de66125a5566df9975155628b400fbfb3a29040ed50faffdbb
+ * CTX 18aece7c5c44693260aab386c0a37b11b114f1c415aebb653be468179428d43a
+ * CTX 4d8bc3ec38813eca30a13cf1bb18d524f1992d44d8b1a42ea30b22e6c95b199d
+ * CTX 8d182f8840b09d059585c31ad691fa0619ff038aca2c39a943421157361717c4
+ * CTX 9d322028a74648113bd8c9d7ec77cf3c89c1ec8718ceff8516d96b34c3c614f1
+ * CTX 0699c9abc4ed0411506223bea16af35c883accdbe1104eef0cfdb54e12fb230a
+ * Plaintext length (bytes): 512
+ */
+static uint8_t v8_key1[16] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+static uint8_t v8_key2[16] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+static uint8_t v8_TW[16] = {
+ 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v8_PTX[512] = {
+ 0xd5, 0x5f, 0x68, 0x4f, 0x81, 0xf4, 0x42, 0x6e,
+ 0x9f, 0xde, 0x92, 0xa5, 0xff, 0x02, 0xdf, 0x2a,
+ 0xc8, 0x96, 0xaf, 0x63, 0x96, 0x28, 0x88, 0xa9,
+ 0x79, 0x10, 0xc1, 0x37, 0x9e, 0x20, 0xb0, 0xa3,
+ 0xb1, 0xdb, 0x61, 0x3f, 0xb7, 0xfe, 0x2e, 0x07,
+ 0x00, 0x43, 0x29, 0xea, 0x5c, 0x22, 0xbf, 0xd3,
+ 0x3e, 0x3d, 0xbe, 0x4c, 0xf5, 0x8c, 0xc6, 0x08,
+ 0xc2, 0xc2, 0x6c, 0x19, 0xa2, 0xe2, 0xfe, 0x22,
+ 0xf9, 0x87, 0x32, 0xc2, 0xb5, 0xcb, 0x84, 0x4c,
+ 0xc6, 0xc0, 0x70, 0x2d, 0x91, 0xe1, 0xd5, 0x0f,
+ 0xc4, 0x38, 0x2a, 0x7e, 0xba, 0x56, 0x35, 0xcd,
+ 0x60, 0x24, 0x32, 0xa2, 0x30, 0x6a, 0xc4, 0xce,
+ 0x82, 0xf8, 0xd7, 0x0c, 0x8d, 0x9b, 0xc1, 0x5f,
+ 0x91, 0x8f, 0xe7, 0x1e, 0x74, 0xc6, 0x22, 0xd5,
+ 0xcf, 0x71, 0x17, 0x8b, 0xf6, 0xe0, 0xb9, 0xcc,
+ 0x9f, 0x2b, 0x41, 0xdd, 0x8d, 0xbe, 0x44, 0x1c,
+ 0x41, 0xcd, 0x0c, 0x73, 0xa6, 0xdc, 0x47, 0xa3,
+ 0x48, 0xf6, 0x70, 0x2f, 0x9d, 0x0e, 0x9b, 0x1b,
+ 0x14, 0x31, 0xe9, 0x48, 0xe2, 0x99, 0xb9, 0xec,
+ 0x22, 0x72, 0xab, 0x2c, 0x5f, 0x0c, 0x7b, 0xe8,
+ 0x6a, 0xff, 0xa5, 0xde, 0xc8, 0x7a, 0x0b, 0xee,
+ 0x81, 0xd3, 0xd5, 0x00, 0x07, 0xed, 0xaa, 0x2b,
+ 0xcf, 0xcc, 0xb3, 0x56, 0x05, 0x15, 0x5f, 0xf3,
+ 0x6e, 0xd8, 0xed, 0xd4, 0xa4, 0x0d, 0xcd, 0x4b,
+ 0x24, 0x3a, 0xcd, 0x11, 0xb2, 0xb9, 0x87, 0xbd,
+ 0xbf, 0xaf, 0x91, 0xa7, 0xca, 0xc2, 0x7e, 0x9c,
+ 0x5a, 0xea, 0x52, 0x5e, 0xe5, 0x3d, 0xe7, 0xb2,
+ 0xd3, 0x33, 0x2c, 0x86, 0x44, 0x40, 0x2b, 0x82,
+ 0x3e, 0x94, 0xa7, 0xdb, 0x26, 0x27, 0x6d, 0x2d,
+ 0x23, 0xaa, 0x07, 0x18, 0x0f, 0x76, 0xb4, 0xfd,
+ 0x29, 0xb9, 0xc0, 0x82, 0x30, 0x99, 0xc9, 0xd6,
+ 0x2c, 0x51, 0x98, 0x80, 0xae, 0xe7, 0xe9, 0x69,
+ 0x76, 0x17, 0xc1, 0x49, 0x7d, 0x47, 0xbf, 0x3e,
+ 0x57, 0x19, 0x50, 0x31, 0x14, 0x21, 0xb6, 0xb7,
+ 0x34, 0xd3, 0x8b, 0x0d, 0xb9, 0x1e, 0xb8, 0x53,
+ 0x31, 0xb9, 0x1e, 0xa9, 0xf6, 0x15, 0x30, 0xf5,
+ 0x45, 0x12, 0xa5, 0xa5, 0x2a, 0x4b, 0xad, 0x58,
+ 0x9e, 0xb6, 0x97, 0x81, 0xd5, 0x37, 0xf2, 0x32,
+ 0x97, 0xbb, 0x45, 0x9b, 0xda, 0xd2, 0x94, 0x8a,
+ 0x29, 0xe1, 0x55, 0x0b, 0xf4, 0x78, 0x7e, 0x0b,
+ 0xe9, 0x5b, 0xb1, 0x73, 0xcf, 0x5f, 0xab, 0x17,
+ 0xda, 0xb7, 0xa1, 0x3a, 0x05, 0x2a, 0x63, 0x45,
+ 0x3d, 0x97, 0xcc, 0xec, 0x1a, 0x32, 0x19, 0x54,
+ 0x88, 0x6b, 0x7a, 0x12, 0x99, 0xfa, 0xae, 0xec,
+ 0xae, 0x35, 0xc6, 0xea, 0xac, 0xa7, 0x53, 0xb0,
+ 0x41, 0xb5, 0xe5, 0xf0, 0x93, 0xbf, 0x83, 0x39,
+ 0x7f, 0xd2, 0x1d, 0xd6, 0xb3, 0x01, 0x20, 0x66,
+ 0xfc, 0xc0, 0x58, 0xcc, 0x32, 0xc3, 0xb0, 0x9d,
+ 0x75, 0x62, 0xde, 0xe2, 0x95, 0x09, 0xb5, 0x83,
+ 0x93, 0x92, 0xc9, 0xff, 0x05, 0xf5, 0x1f, 0x31,
+ 0x66, 0xaa, 0xac, 0x4a, 0xc5, 0xf2, 0x38, 0x03,
+ 0x8a, 0x30, 0x45, 0xe6, 0xf7, 0x2e, 0x48, 0xef,
+ 0x0f, 0xe8, 0xbc, 0x67, 0x5e, 0x82, 0xc3, 0x18,
+ 0xa2, 0x68, 0xe4, 0x39, 0x70, 0x27, 0x1b, 0xf1,
+ 0x19, 0xb8, 0x1b, 0xf6, 0xa9, 0x82, 0x74, 0x65,
+ 0x54, 0xf8, 0x4e, 0x72, 0xb9, 0xf0, 0x02, 0x80,
+ 0xa3, 0x20, 0xa0, 0x81, 0x42, 0x92, 0x3c, 0x23,
+ 0xc8, 0x83, 0x42, 0x3f, 0xf9, 0x49, 0x82, 0x7f,
+ 0x29, 0xbb, 0xac, 0xdc, 0x1c, 0xcd, 0xb0, 0x49,
+ 0x38, 0xce, 0x60, 0x98, 0xc9, 0x5b, 0xa6, 0xb3,
+ 0x25, 0x28, 0xf4, 0xef, 0x78, 0xee, 0xd7, 0x78,
+ 0xb2, 0xe1, 0x22, 0xdd, 0xfd, 0x1c, 0xbd, 0xd1,
+ 0x1d, 0x1c, 0x0a, 0x67, 0x83, 0xe0, 0x11, 0xfc,
+ 0x53, 0x6d, 0x63, 0xd0, 0x53, 0x26, 0x06, 0x37
+};
+
+static uint8_t v8_CTX[512] = {
+ 0x72, 0xef, 0xc1, 0xeb, 0xfe, 0x1e, 0xe2, 0x59,
+ 0x75, 0xa6, 0xeb, 0x3a, 0xa8, 0x58, 0x9d, 0xda,
+ 0x2b, 0x26, 0x1f, 0x1c, 0x85, 0xbd, 0xab, 0x44,
+ 0x2a, 0x9e, 0x5b, 0x2d, 0xd1, 0xd7, 0xc3, 0x95,
+ 0x7a, 0x16, 0xfc, 0x08, 0xe5, 0x26, 0xd4, 0xb1,
+ 0x22, 0x3f, 0x1b, 0x12, 0x32, 0xa1, 0x1a, 0xf2,
+ 0x74, 0xc3, 0xd7, 0x0d, 0xac, 0x57, 0xf8, 0x3e,
+ 0x09, 0x83, 0xc4, 0x98, 0xf1, 0xa6, 0xf1, 0xae,
+ 0xcb, 0x02, 0x1c, 0x3e, 0x70, 0x08, 0x5a, 0x1e,
+ 0x52, 0x7f, 0x1c, 0xe4, 0x1e, 0xe5, 0x91, 0x1a,
+ 0x82, 0x02, 0x01, 0x61, 0x52, 0x9c, 0xd8, 0x27,
+ 0x73, 0x76, 0x2d, 0xaf, 0x54, 0x59, 0xde, 0x94,
+ 0xa0, 0xa8, 0x2a, 0xda, 0xe7, 0xe1, 0x70, 0x3c,
+ 0x80, 0x85, 0x43, 0xc2, 0x9e, 0xd6, 0xfb, 0x32,
+ 0xd9, 0xe0, 0x04, 0x32, 0x7c, 0x13, 0x55, 0x18,
+ 0x0c, 0x99, 0x5a, 0x07, 0x74, 0x14, 0x93, 0xa0,
+ 0x9c, 0x21, 0xba, 0x01, 0xa3, 0x87, 0x88, 0x2d,
+ 0xa4, 0xf6, 0x25, 0x34, 0xb8, 0x7b, 0xb1, 0x5d,
+ 0x60, 0xd1, 0x97, 0x20, 0x1c, 0x0f, 0xd3, 0xbf,
+ 0x30, 0xc1, 0x50, 0x0a, 0x3e, 0xcf, 0xec, 0xdd,
+ 0x66, 0xd8, 0x72, 0x1f, 0x90, 0xbc, 0xc4, 0xc1,
+ 0x7e, 0xe9, 0x25, 0xc6, 0x1b, 0x0a, 0x03, 0x72,
+ 0x7a, 0x9c, 0x0d, 0x5f, 0x5c, 0xa4, 0x62, 0xfb,
+ 0xfa, 0x0a, 0xf1, 0xc2, 0x51, 0x3a, 0x9d, 0x9d,
+ 0x4b, 0x53, 0x45, 0xbd, 0x27, 0xa5, 0xf6, 0xe6,
+ 0x53, 0xf7, 0x51, 0x69, 0x3e, 0x6b, 0x6a, 0x2b,
+ 0x8e, 0xad, 0x57, 0xd5, 0x11, 0xe0, 0x0e, 0x58,
+ 0xc4, 0x5b, 0x7b, 0x8d, 0x00, 0x5a, 0xf7, 0x92,
+ 0x88, 0xf5, 0xc7, 0xc2, 0x2f, 0xd4, 0xf1, 0xbf,
+ 0x7a, 0x89, 0x8b, 0x03, 0xa5, 0x63, 0x4c, 0x6a,
+ 0x1a, 0xe3, 0xf9, 0xfa, 0xe5, 0xde, 0x4f, 0x29,
+ 0x6a, 0x28, 0x96, 0xb2, 0x3e, 0x7e, 0xd4, 0x3e,
+ 0xd1, 0x4f, 0xa5, 0xa2, 0x80, 0x3f, 0x4d, 0x28,
+ 0xf0, 0xd3, 0xff, 0xcf, 0x24, 0x75, 0x76, 0x77,
+ 0xae, 0xbd, 0xb4, 0x7b, 0xb3, 0x88, 0x37, 0x87,
+ 0x08, 0x94, 0x8a, 0x8d, 0x41, 0x26, 0xed, 0x18,
+ 0x39, 0xe0, 0xda, 0x29, 0xa5, 0x37, 0xa8, 0xc1,
+ 0x98, 0xb3, 0xc6, 0x6a, 0xb0, 0x07, 0x12, 0xdd,
+ 0x26, 0x16, 0x74, 0xbf, 0x45, 0xa7, 0x3d, 0x67,
+ 0xf7, 0x69, 0x14, 0xf8, 0x30, 0xca, 0x01, 0x4b,
+ 0x65, 0x59, 0x6f, 0x27, 0xe4, 0xcf, 0x62, 0xde,
+ 0x66, 0x12, 0x5a, 0x55, 0x66, 0xdf, 0x99, 0x75,
+ 0x15, 0x56, 0x28, 0xb4, 0x00, 0xfb, 0xfb, 0x3a,
+ 0x29, 0x04, 0x0e, 0xd5, 0x0f, 0xaf, 0xfd, 0xbb,
+ 0x18, 0xae, 0xce, 0x7c, 0x5c, 0x44, 0x69, 0x32,
+ 0x60, 0xaa, 0xb3, 0x86, 0xc0, 0xa3, 0x7b, 0x11,
+ 0xb1, 0x14, 0xf1, 0xc4, 0x15, 0xae, 0xbb, 0x65,
+ 0x3b, 0xe4, 0x68, 0x17, 0x94, 0x28, 0xd4, 0x3a,
+ 0x4d, 0x8b, 0xc3, 0xec, 0x38, 0x81, 0x3e, 0xca,
+ 0x30, 0xa1, 0x3c, 0xf1, 0xbb, 0x18, 0xd5, 0x24,
+ 0xf1, 0x99, 0x2d, 0x44, 0xd8, 0xb1, 0xa4, 0x2e,
+ 0xa3, 0x0b, 0x22, 0xe6, 0xc9, 0x5b, 0x19, 0x9d,
+ 0x8d, 0x18, 0x2f, 0x88, 0x40, 0xb0, 0x9d, 0x05,
+ 0x95, 0x85, 0xc3, 0x1a, 0xd6, 0x91, 0xfa, 0x06,
+ 0x19, 0xff, 0x03, 0x8a, 0xca, 0x2c, 0x39, 0xa9,
+ 0x43, 0x42, 0x11, 0x57, 0x36, 0x17, 0x17, 0xc4,
+ 0x9d, 0x32, 0x20, 0x28, 0xa7, 0x46, 0x48, 0x11,
+ 0x3b, 0xd8, 0xc9, 0xd7, 0xec, 0x77, 0xcf, 0x3c,
+ 0x89, 0xc1, 0xec, 0x87, 0x18, 0xce, 0xff, 0x85,
+ 0x16, 0xd9, 0x6b, 0x34, 0xc3, 0xc6, 0x14, 0xf1,
+ 0x06, 0x99, 0xc9, 0xab, 0xc4, 0xed, 0x04, 0x11,
+ 0x50, 0x62, 0x23, 0xbe, 0xa1, 0x6a, 0xf3, 0x5c,
+ 0x88, 0x3a, 0xcc, 0xdb, 0xe1, 0x10, 0x4e, 0xef,
+ 0x0c, 0xfd, 0xb5, 0x4e, 0x12, 0xfb, 0x23, 0x0a
+};
+
+/*
+ * Vector 9
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence Number ff
+ * PTX 72efc1ebfe1ee25975a6eb3aa8589dda2b261f1c85bdab442a9e5b2dd1d7c395
+ * PTX 7a16fc08e526d4b1223f1b1232a11af274c3d70dac57f83e0983c498f1a6f1ae
+ * PTX cb021c3e70085a1e527f1ce41ee5911a82020161529cd82773762daf5459de94
+ * PTX a0a82adae7e1703c808543c29ed6fb32d9e004327c1355180c995a07741493a0
+ * PTX 9c21ba01a387882da4f62534b87bb15d60d197201c0fd3bf30c1500a3ecfecdd
+ * PTX 66d8721f90bcc4c17ee925c61b0a03727a9c0d5f5ca462fbfa0af1c2513a9d9d
+ * PTX 4b5345bd27a5f6e653f751693e6b6a2b8ead57d511e00e58c45b7b8d005af792
+ * PTX 88f5c7c22fd4f1bf7a898b03a5634c6a1ae3f9fae5de4f296a2896b23e7ed43e
+ * PTX d14fa5a2803f4d28f0d3ffcf24757677aebdb47bb388378708948a8d4126ed18
+ * PTX 39e0da29a537a8c198b3c66ab00712dd261674bf45a73d67f76914f830ca014b
+ * PTX 65596f27e4cf62de66125a5566df9975155628b400fbfb3a29040ed50faffdbb
+ * PTX 18aece7c5c44693260aab386c0a37b11b114f1c415aebb653be468179428d43a
+ * PTX 4d8bc3ec38813eca30a13cf1bb18d524f1992d44d8b1a42ea30b22e6c95b199d
+ * PTX 8d182f8840b09d059585c31ad691fa0619ff038aca2c39a943421157361717c4
+ * PTX 9d322028a74648113bd8c9d7ec77cf3c89c1ec8718ceff8516d96b34c3c614f1
+ * PTX 0699c9abc4ed0411506223bea16af35c883accdbe1104eef0cfdb54e12fb230a
+ * CTX 3260ae8dad1f4a32c5cafe3ab0eb95549d461a67ceb9e5aa2d3afb62dece0553
+ * CTX 193ba50c75be251e08d1d08f1088576c7efdfaaf3f459559571e12511753b07a
+ * CTX f073f35da06af0ce0bbf6b8f5ccc5cea500ec1b211bd51f63b606bf6528796ca
+ * CTX 12173ba39b8935ee44ccce646f90a45bf9ccc567f0ace13dc2d53ebeedc81f58
+ * CTX b2e41179dddf0d5a5c42f5d8506c1a5d2f8f59f3ea873cbcd0eec19acbf32542
+ * CTX 3bd3dcb8c2b1bf1d1eaed0eba7f0698e4314fbeb2f1566d1b9253008cbccf45a
+ * CTX 2b0d9c5c9c21474f4076e02be26050b99dee4fd68a4cf890e496e4fcae7b70f9
+ * CTX 4ea5a9062da0daeba1993d2ccd1dd3c244b8428801495a58b216547e7e847c46
+ * CTX d1d756377b6242d2e5fb83bf752b54e0df71e889f3a2bb0f4c10805bf3c59037
+ * CTX 6e3c24e22ff57f7fa965577375325cea5d920db94b9c336b455f6e894c01866f
+ * CTX e9fbb8c8d3f70a2957285f6dfb5dcd8cbf54782f8fe7766d4723819913ac7734
+ * CTX 21e3a31095866bad22c86a6036b2518b2059b4229d18c8c2ccbdf906c6cc6e82
+ * CTX 464ee57bddb0bebcb1dc645325bfb3e665ef7251082c88ebb1cf203bd779fdd3
+ * CTX 8675713c8daadd17e1cabee432b09787b6ddf3304e38b731b45df5df51b78fcf
+ * CTX b3d32466028d0ba36555e7e11ab0ee0666061d1645d962444bc47a38188930a8
+ * CTX 4b4d561395c73c087021927ca638b7afc8a8679ccb84c26555440ec7f10445cd
+ * Plaintext length (bytes): 512
+ */
+static uint8_t v9_key1[16] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+static uint8_t v9_key2[16] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+static uint8_t v9_TW[16] = {
+ 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v9_PTX[512] = {
+ 0x72, 0xef, 0xc1, 0xeb, 0xfe, 0x1e, 0xe2, 0x59,
+ 0x75, 0xa6, 0xeb, 0x3a, 0xa8, 0x58, 0x9d, 0xda,
+ 0x2b, 0x26, 0x1f, 0x1c, 0x85, 0xbd, 0xab, 0x44,
+ 0x2a, 0x9e, 0x5b, 0x2d, 0xd1, 0xd7, 0xc3, 0x95,
+ 0x7a, 0x16, 0xfc, 0x08, 0xe5, 0x26, 0xd4, 0xb1,
+ 0x22, 0x3f, 0x1b, 0x12, 0x32, 0xa1, 0x1a, 0xf2,
+ 0x74, 0xc3, 0xd7, 0x0d, 0xac, 0x57, 0xf8, 0x3e,
+ 0x09, 0x83, 0xc4, 0x98, 0xf1, 0xa6, 0xf1, 0xae,
+ 0xcb, 0x02, 0x1c, 0x3e, 0x70, 0x08, 0x5a, 0x1e,
+ 0x52, 0x7f, 0x1c, 0xe4, 0x1e, 0xe5, 0x91, 0x1a,
+ 0x82, 0x02, 0x01, 0x61, 0x52, 0x9c, 0xd8, 0x27,
+ 0x73, 0x76, 0x2d, 0xaf, 0x54, 0x59, 0xde, 0x94,
+ 0xa0, 0xa8, 0x2a, 0xda, 0xe7, 0xe1, 0x70, 0x3c,
+ 0x80, 0x85, 0x43, 0xc2, 0x9e, 0xd6, 0xfb, 0x32,
+ 0xd9, 0xe0, 0x04, 0x32, 0x7c, 0x13, 0x55, 0x18,
+ 0x0c, 0x99, 0x5a, 0x07, 0x74, 0x14, 0x93, 0xa0,
+ 0x9c, 0x21, 0xba, 0x01, 0xa3, 0x87, 0x88, 0x2d,
+ 0xa4, 0xf6, 0x25, 0x34, 0xb8, 0x7b, 0xb1, 0x5d,
+ 0x60, 0xd1, 0x97, 0x20, 0x1c, 0x0f, 0xd3, 0xbf,
+ 0x30, 0xc1, 0x50, 0x0a, 0x3e, 0xcf, 0xec, 0xdd,
+ 0x66, 0xd8, 0x72, 0x1f, 0x90, 0xbc, 0xc4, 0xc1,
+ 0x7e, 0xe9, 0x25, 0xc6, 0x1b, 0x0a, 0x03, 0x72,
+ 0x7a, 0x9c, 0x0d, 0x5f, 0x5c, 0xa4, 0x62, 0xfb,
+ 0xfa, 0x0a, 0xf1, 0xc2, 0x51, 0x3a, 0x9d, 0x9d,
+ 0x4b, 0x53, 0x45, 0xbd, 0x27, 0xa5, 0xf6, 0xe6,
+ 0x53, 0xf7, 0x51, 0x69, 0x3e, 0x6b, 0x6a, 0x2b,
+ 0x8e, 0xad, 0x57, 0xd5, 0x11, 0xe0, 0x0e, 0x58,
+ 0xc4, 0x5b, 0x7b, 0x8d, 0x00, 0x5a, 0xf7, 0x92,
+ 0x88, 0xf5, 0xc7, 0xc2, 0x2f, 0xd4, 0xf1, 0xbf,
+ 0x7a, 0x89, 0x8b, 0x03, 0xa5, 0x63, 0x4c, 0x6a,
+ 0x1a, 0xe3, 0xf9, 0xfa, 0xe5, 0xde, 0x4f, 0x29,
+ 0x6a, 0x28, 0x96, 0xb2, 0x3e, 0x7e, 0xd4, 0x3e,
+ 0xd1, 0x4f, 0xa5, 0xa2, 0x80, 0x3f, 0x4d, 0x28,
+ 0xf0, 0xd3, 0xff, 0xcf, 0x24, 0x75, 0x76, 0x77,
+ 0xae, 0xbd, 0xb4, 0x7b, 0xb3, 0x88, 0x37, 0x87,
+ 0x08, 0x94, 0x8a, 0x8d, 0x41, 0x26, 0xed, 0x18,
+ 0x39, 0xe0, 0xda, 0x29, 0xa5, 0x37, 0xa8, 0xc1,
+ 0x98, 0xb3, 0xc6, 0x6a, 0xb0, 0x07, 0x12, 0xdd,
+ 0x26, 0x16, 0x74, 0xbf, 0x45, 0xa7, 0x3d, 0x67,
+ 0xf7, 0x69, 0x14, 0xf8, 0x30, 0xca, 0x01, 0x4b,
+ 0x65, 0x59, 0x6f, 0x27, 0xe4, 0xcf, 0x62, 0xde,
+ 0x66, 0x12, 0x5a, 0x55, 0x66, 0xdf, 0x99, 0x75,
+ 0x15, 0x56, 0x28, 0xb4, 0x00, 0xfb, 0xfb, 0x3a,
+ 0x29, 0x04, 0x0e, 0xd5, 0x0f, 0xaf, 0xfd, 0xbb,
+ 0x18, 0xae, 0xce, 0x7c, 0x5c, 0x44, 0x69, 0x32,
+ 0x60, 0xaa, 0xb3, 0x86, 0xc0, 0xa3, 0x7b, 0x11,
+ 0xb1, 0x14, 0xf1, 0xc4, 0x15, 0xae, 0xbb, 0x65,
+ 0x3b, 0xe4, 0x68, 0x17, 0x94, 0x28, 0xd4, 0x3a,
+ 0x4d, 0x8b, 0xc3, 0xec, 0x38, 0x81, 0x3e, 0xca,
+ 0x30, 0xa1, 0x3c, 0xf1, 0xbb, 0x18, 0xd5, 0x24,
+ 0xf1, 0x99, 0x2d, 0x44, 0xd8, 0xb1, 0xa4, 0x2e,
+ 0xa3, 0x0b, 0x22, 0xe6, 0xc9, 0x5b, 0x19, 0x9d,
+ 0x8d, 0x18, 0x2f, 0x88, 0x40, 0xb0, 0x9d, 0x05,
+ 0x95, 0x85, 0xc3, 0x1a, 0xd6, 0x91, 0xfa, 0x06,
+ 0x19, 0xff, 0x03, 0x8a, 0xca, 0x2c, 0x39, 0xa9,
+ 0x43, 0x42, 0x11, 0x57, 0x36, 0x17, 0x17, 0xc4,
+ 0x9d, 0x32, 0x20, 0x28, 0xa7, 0x46, 0x48, 0x11,
+ 0x3b, 0xd8, 0xc9, 0xd7, 0xec, 0x77, 0xcf, 0x3c,
+ 0x89, 0xc1, 0xec, 0x87, 0x18, 0xce, 0xff, 0x85,
+ 0x16, 0xd9, 0x6b, 0x34, 0xc3, 0xc6, 0x14, 0xf1,
+ 0x06, 0x99, 0xc9, 0xab, 0xc4, 0xed, 0x04, 0x11,
+ 0x50, 0x62, 0x23, 0xbe, 0xa1, 0x6a, 0xf3, 0x5c,
+ 0x88, 0x3a, 0xcc, 0xdb, 0xe1, 0x10, 0x4e, 0xef,
+ 0x0c, 0xfd, 0xb5, 0x4e, 0x12, 0xfb, 0x23, 0x0a
+};
+
+static uint8_t v9_CTX[512] = {
+ 0x32, 0x60, 0xae, 0x8d, 0xad, 0x1f, 0x4a, 0x32,
+ 0xc5, 0xca, 0xfe, 0x3a, 0xb0, 0xeb, 0x95, 0x54,
+ 0x9d, 0x46, 0x1a, 0x67, 0xce, 0xb9, 0xe5, 0xaa,
+ 0x2d, 0x3a, 0xfb, 0x62, 0xde, 0xce, 0x05, 0x53,
+ 0x19, 0x3b, 0xa5, 0x0c, 0x75, 0xbe, 0x25, 0x1e,
+ 0x08, 0xd1, 0xd0, 0x8f, 0x10, 0x88, 0x57, 0x6c,
+ 0x7e, 0xfd, 0xfa, 0xaf, 0x3f, 0x45, 0x95, 0x59,
+ 0x57, 0x1e, 0x12, 0x51, 0x17, 0x53, 0xb0, 0x7a,
+ 0xf0, 0x73, 0xf3, 0x5d, 0xa0, 0x6a, 0xf0, 0xce,
+ 0x0b, 0xbf, 0x6b, 0x8f, 0x5c, 0xcc, 0x5c, 0xea,
+ 0x50, 0x0e, 0xc1, 0xb2, 0x11, 0xbd, 0x51, 0xf6,
+ 0x3b, 0x60, 0x6b, 0xf6, 0x52, 0x87, 0x96, 0xca,
+ 0x12, 0x17, 0x3b, 0xa3, 0x9b, 0x89, 0x35, 0xee,
+ 0x44, 0xcc, 0xce, 0x64, 0x6f, 0x90, 0xa4, 0x5b,
+ 0xf9, 0xcc, 0xc5, 0x67, 0xf0, 0xac, 0xe1, 0x3d,
+ 0xc2, 0xd5, 0x3e, 0xbe, 0xed, 0xc8, 0x1f, 0x58,
+ 0xb2, 0xe4, 0x11, 0x79, 0xdd, 0xdf, 0x0d, 0x5a,
+ 0x5c, 0x42, 0xf5, 0xd8, 0x50, 0x6c, 0x1a, 0x5d,
+ 0x2f, 0x8f, 0x59, 0xf3, 0xea, 0x87, 0x3c, 0xbc,
+ 0xd0, 0xee, 0xc1, 0x9a, 0xcb, 0xf3, 0x25, 0x42,
+ 0x3b, 0xd3, 0xdc, 0xb8, 0xc2, 0xb1, 0xbf, 0x1d,
+ 0x1e, 0xae, 0xd0, 0xeb, 0xa7, 0xf0, 0x69, 0x8e,
+ 0x43, 0x14, 0xfb, 0xeb, 0x2f, 0x15, 0x66, 0xd1,
+ 0xb9, 0x25, 0x30, 0x08, 0xcb, 0xcc, 0xf4, 0x5a,
+ 0x2b, 0x0d, 0x9c, 0x5c, 0x9c, 0x21, 0x47, 0x4f,
+ 0x40, 0x76, 0xe0, 0x2b, 0xe2, 0x60, 0x50, 0xb9,
+ 0x9d, 0xee, 0x4f, 0xd6, 0x8a, 0x4c, 0xf8, 0x90,
+ 0xe4, 0x96, 0xe4, 0xfc, 0xae, 0x7b, 0x70, 0xf9,
+ 0x4e, 0xa5, 0xa9, 0x06, 0x2d, 0xa0, 0xda, 0xeb,
+ 0xa1, 0x99, 0x3d, 0x2c, 0xcd, 0x1d, 0xd3, 0xc2,
+ 0x44, 0xb8, 0x42, 0x88, 0x01, 0x49, 0x5a, 0x58,
+ 0xb2, 0x16, 0x54, 0x7e, 0x7e, 0x84, 0x7c, 0x46,
+ 0xd1, 0xd7, 0x56, 0x37, 0x7b, 0x62, 0x42, 0xd2,
+ 0xe5, 0xfb, 0x83, 0xbf, 0x75, 0x2b, 0x54, 0xe0,
+ 0xdf, 0x71, 0xe8, 0x89, 0xf3, 0xa2, 0xbb, 0x0f,
+ 0x4c, 0x10, 0x80, 0x5b, 0xf3, 0xc5, 0x90, 0x37,
+ 0x6e, 0x3c, 0x24, 0xe2, 0x2f, 0xf5, 0x7f, 0x7f,
+ 0xa9, 0x65, 0x57, 0x73, 0x75, 0x32, 0x5c, 0xea,
+ 0x5d, 0x92, 0x0d, 0xb9, 0x4b, 0x9c, 0x33, 0x6b,
+ 0x45, 0x5f, 0x6e, 0x89, 0x4c, 0x01, 0x86, 0x6f,
+ 0xe9, 0xfb, 0xb8, 0xc8, 0xd3, 0xf7, 0x0a, 0x29,
+ 0x57, 0x28, 0x5f, 0x6d, 0xfb, 0x5d, 0xcd, 0x8c,
+ 0xbf, 0x54, 0x78, 0x2f, 0x8f, 0xe7, 0x76, 0x6d,
+ 0x47, 0x23, 0x81, 0x99, 0x13, 0xac, 0x77, 0x34,
+ 0x21, 0xe3, 0xa3, 0x10, 0x95, 0x86, 0x6b, 0xad,
+ 0x22, 0xc8, 0x6a, 0x60, 0x36, 0xb2, 0x51, 0x8b,
+ 0x20, 0x59, 0xb4, 0x22, 0x9d, 0x18, 0xc8, 0xc2,
+ 0xcc, 0xbd, 0xf9, 0x06, 0xc6, 0xcc, 0x6e, 0x82,
+ 0x46, 0x4e, 0xe5, 0x7b, 0xdd, 0xb0, 0xbe, 0xbc,
+ 0xb1, 0xdc, 0x64, 0x53, 0x25, 0xbf, 0xb3, 0xe6,
+ 0x65, 0xef, 0x72, 0x51, 0x08, 0x2c, 0x88, 0xeb,
+ 0xb1, 0xcf, 0x20, 0x3b, 0xd7, 0x79, 0xfd, 0xd3,
+ 0x86, 0x75, 0x71, 0x3c, 0x8d, 0xaa, 0xdd, 0x17,
+ 0xe1, 0xca, 0xbe, 0xe4, 0x32, 0xb0, 0x97, 0x87,
+ 0xb6, 0xdd, 0xf3, 0x30, 0x4e, 0x38, 0xb7, 0x31,
+ 0xb4, 0x5d, 0xf5, 0xdf, 0x51, 0xb7, 0x8f, 0xcf,
+ 0xb3, 0xd3, 0x24, 0x66, 0x02, 0x8d, 0x0b, 0xa3,
+ 0x65, 0x55, 0xe7, 0xe1, 0x1a, 0xb0, 0xee, 0x06,
+ 0x66, 0x06, 0x1d, 0x16, 0x45, 0xd9, 0x62, 0x44,
+ 0x4b, 0xc4, 0x7a, 0x38, 0x18, 0x89, 0x30, 0xa8,
+ 0x4b, 0x4d, 0x56, 0x13, 0x95, 0xc7, 0x3c, 0x08,
+ 0x70, 0x21, 0x92, 0x7c, 0xa6, 0x38, 0xb7, 0xaf,
+ 0xc8, 0xa8, 0x67, 0x9c, 0xcb, 0x84, 0xc2, 0x65,
+ 0x55, 0x44, 0x0e, 0xc7, 0xf1, 0x04, 0x45, 0xcd
+};
+
+/*
+ * Vector 15
+ * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0
+ * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0
+ * Data unit sequence number 9a78563412
+ * PTX 000102030405060708090a0b0c0d0e0f10
+ * CTX 6c1625db4671522d3d7599601de7ca09ed
+ * Plaintext length (bytes): 17
+ */
+
+static uint8_t v15_key1[16] = {
+ 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8,
+ 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0
+};
+
+static uint8_t v15_key2[16] = {
+ 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8,
+ 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0
+};
+
+static uint8_t v15_TW[16] = {
+ 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v15_PTX[17] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10
+};
+
+static uint8_t v15_CTX[17] = {
+ 0x6c, 0x16, 0x25, 0xdb, 0x46, 0x71, 0x52, 0x2d,
+ 0x3d, 0x75, 0x99, 0x60, 0x1d, 0xe7, 0xca, 0x09,
+ 0xed
+};
+
+/*
+ * Vector 16
+ * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0
+ * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0
+ * Data unit sequence number 9a78563412
+ * PTX 000102030405060708090a0b0c0d0e0f1011
+ * CTX d069444b7a7e0cab09e24447d24deb1fedbf
+ * Plaintext length (bytes): 18
+ */
+static uint8_t v16_key1[16] = {
+ 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8,
+ 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0
+};
+
+static uint8_t v16_key2[16] = {
+ 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8,
+ 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0
+};
+
+static uint8_t v16_TW[16] = {
+ 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v16_PTX[18] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11
+};
+
+static uint8_t v16_CTX[18] = {
+ 0xd0, 0x69, 0x44, 0x4b, 0x7a, 0x7e, 0x0c, 0xab,
+ 0x09, 0xe2, 0x44, 0x47, 0xd2, 0x4d, 0xeb, 0x1f,
+ 0xed, 0xbf
+};
+
+/*
+ * Vector 17
+ * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0
+ * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0
+ * Data unit sequence number 9a78563412
+ * PTX 000102030405060708090a0b0c0d0e0f101112
+ * CTX e5df1351c0544ba1350b3363cd8ef4beedbf9d
+ * Plaintext length (bytes): 19
+ */
+
+static uint8_t v17_key1[16] = {
+ 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8,
+ 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0
+};
+
+static uint8_t v17_key2[16] = {
+ 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8,
+ 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0
+};
+
+static uint8_t v17_TW[16] = {
+ 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v17_PTX[19] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12
+};
+
+static uint8_t v17_CTX[19] = {
+ 0xe5, 0xdf, 0x13, 0x51, 0xc0, 0x54, 0x4b, 0xa1,
+ 0x35, 0x0b, 0x33, 0x63, 0xcd, 0x8e, 0xf4, 0xbe,
+ 0xed, 0xbf, 0x9d
+};
+
+/*
+ * Vector 18
+ * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0
+ * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0
+ * Data unit sequence number 9a78563412
+ * PTX 000102030405060708090a0b0c0d0e0f10111213
+ * CTX 9d84c813f719aa2c7be3f66171c7c5c2edbf9dac
+ * Plaintext length (bytes): 20
+ */
+
+static uint8_t v18_key1[16] = {
+ 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8,
+ 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0
+};
+
+static uint8_t v18_key2[16] = {
+ 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8,
+ 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0
+};
+
+static uint8_t v18_TW[16] = {
+ 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v18_PTX[20] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13
+};
+
+static uint8_t v18_CTX[20] = {
+ 0x9d, 0x84, 0xc8, 0x13, 0xf7, 0x19, 0xaa, 0x2c,
+ 0x7b, 0xe3, 0xf6, 0x61, 0x71, 0xc7, 0xc5, 0xc2,
+ 0xed, 0xbf, 0x9d, 0xac
+};
+
+/*
+ * Vector 19
+ * Key1 e0e1e2e3e4e5e6e7e8e9eaebecedeeef
+ * Key2 c0c1c2c3c4c5c6c7c8c9cacbcccdcecf
+ * Data unit sequence number 21436587a9
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX 38b45812ef43a05bd957e545907e223b954ab4aaf088303ad910eadf14b42be6
+ * CTX 8b2461149d8c8ba85f992be970bc621f1b06573f63e867bf5875acafa04e42cc
+ * CTX bd7bd3c2a0fb1fff791ec5ec36c66ae4ac1e806d81fbf709dbe29e471fad3854
+ * CTX 9c8e66f5345d7c1eb94f405d1ec785cc6f6a68f6254dd8339f9d84057e01a177
+ * CTX 41990482999516b5611a38f41bb6478e6f173f320805dd71b1932fc333cb9ee3
+ * CTX 9936beea9ad96fa10fb4112b901734ddad40bc1878995f8e11aee7d141a2f5d4
+ * CTX 8b7a4e1e7f0b2c04830e69a4fd1378411c2f287edf48c6c4e5c247a19680f7fe
+ * CTX 41cefbd49b582106e3616cbbe4dfb2344b2ae9519391f3e0fb4922254b1d6d2d
+ * CTX 19c6d4d537b3a26f3bcc51588b32f3eca0829b6a5ac72578fb814fb43cf80d64
+ * CTX a233e3f997a3f02683342f2b33d25b492536b93becb2f5e1a8b82f5b88334272
+ * CTX 9e8ae09d16938841a21a97fb543eea3bbff59f13c1a18449e398701c1ad51648
+ * CTX 346cbc04c27bb2da3b93a1372ccae548fb53bee476f9e9c91773b1bb19828394
+ * CTX d55d3e1a20ed69113a860b6829ffa847224604435070221b257e8dff783615d2
+ * CTX cae4803a93aa4334ab482a0afac9c0aeda70b45a481df5dec5df8cc0f423c77a
+ * CTX 5fd46cd312021d4b438862419a791be03bb4d97c0e59578542531ba466a83baf
+ * CTX 92cefc151b5cc1611a167893819b63fb8a6b18e86de60290fa72b797b0ce59f3
+ * Plaintext length (bytes): 512
+ */
+static uint8_t v19_key1[16] = {
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef
+};
+
+static uint8_t v19_key2[16] = {
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf
+};
+
+static uint8_t v19_TW[16] = {
+ 0x21, 0x43, 0x65, 0x87, 0xa9, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v19_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v19_CTX[512] = {
+ 0x38, 0xb4, 0x58, 0x12, 0xef, 0x43, 0xa0, 0x5b,
+ 0xd9, 0x57, 0xe5, 0x45, 0x90, 0x7e, 0x22, 0x3b,
+ 0x95, 0x4a, 0xb4, 0xaa, 0xf0, 0x88, 0x30, 0x3a,
+ 0xd9, 0x10, 0xea, 0xdf, 0x14, 0xb4, 0x2b, 0xe6,
+ 0x8b, 0x24, 0x61, 0x14, 0x9d, 0x8c, 0x8b, 0xa8,
+ 0x5f, 0x99, 0x2b, 0xe9, 0x70, 0xbc, 0x62, 0x1f,
+ 0x1b, 0x06, 0x57, 0x3f, 0x63, 0xe8, 0x67, 0xbf,
+ 0x58, 0x75, 0xac, 0xaf, 0xa0, 0x4e, 0x42, 0xcc,
+ 0xbd, 0x7b, 0xd3, 0xc2, 0xa0, 0xfb, 0x1f, 0xff,
+ 0x79, 0x1e, 0xc5, 0xec, 0x36, 0xc6, 0x6a, 0xe4,
+ 0xac, 0x1e, 0x80, 0x6d, 0x81, 0xfb, 0xf7, 0x09,
+ 0xdb, 0xe2, 0x9e, 0x47, 0x1f, 0xad, 0x38, 0x54,
+ 0x9c, 0x8e, 0x66, 0xf5, 0x34, 0x5d, 0x7c, 0x1e,
+ 0xb9, 0x4f, 0x40, 0x5d, 0x1e, 0xc7, 0x85, 0xcc,
+ 0x6f, 0x6a, 0x68, 0xf6, 0x25, 0x4d, 0xd8, 0x33,
+ 0x9f, 0x9d, 0x84, 0x05, 0x7e, 0x01, 0xa1, 0x77,
+ 0x41, 0x99, 0x04, 0x82, 0x99, 0x95, 0x16, 0xb5,
+ 0x61, 0x1a, 0x38, 0xf4, 0x1b, 0xb6, 0x47, 0x8e,
+ 0x6f, 0x17, 0x3f, 0x32, 0x08, 0x05, 0xdd, 0x71,
+ 0xb1, 0x93, 0x2f, 0xc3, 0x33, 0xcb, 0x9e, 0xe3,
+ 0x99, 0x36, 0xbe, 0xea, 0x9a, 0xd9, 0x6f, 0xa1,
+ 0x0f, 0xb4, 0x11, 0x2b, 0x90, 0x17, 0x34, 0xdd,
+ 0xad, 0x40, 0xbc, 0x18, 0x78, 0x99, 0x5f, 0x8e,
+ 0x11, 0xae, 0xe7, 0xd1, 0x41, 0xa2, 0xf5, 0xd4,
+ 0x8b, 0x7a, 0x4e, 0x1e, 0x7f, 0x0b, 0x2c, 0x04,
+ 0x83, 0x0e, 0x69, 0xa4, 0xfd, 0x13, 0x78, 0x41,
+ 0x1c, 0x2f, 0x28, 0x7e, 0xdf, 0x48, 0xc6, 0xc4,
+ 0xe5, 0xc2, 0x47, 0xa1, 0x96, 0x80, 0xf7, 0xfe,
+ 0x41, 0xce, 0xfb, 0xd4, 0x9b, 0x58, 0x21, 0x06,
+ 0xe3, 0x61, 0x6c, 0xbb, 0xe4, 0xdf, 0xb2, 0x34,
+ 0x4b, 0x2a, 0xe9, 0x51, 0x93, 0x91, 0xf3, 0xe0,
+ 0xfb, 0x49, 0x22, 0x25, 0x4b, 0x1d, 0x6d, 0x2d,
+ 0x19, 0xc6, 0xd4, 0xd5, 0x37, 0xb3, 0xa2, 0x6f,
+ 0x3b, 0xcc, 0x51, 0x58, 0x8b, 0x32, 0xf3, 0xec,
+ 0xa0, 0x82, 0x9b, 0x6a, 0x5a, 0xc7, 0x25, 0x78,
+ 0xfb, 0x81, 0x4f, 0xb4, 0x3c, 0xf8, 0x0d, 0x64,
+ 0xa2, 0x33, 0xe3, 0xf9, 0x97, 0xa3, 0xf0, 0x26,
+ 0x83, 0x34, 0x2f, 0x2b, 0x33, 0xd2, 0x5b, 0x49,
+ 0x25, 0x36, 0xb9, 0x3b, 0xec, 0xb2, 0xf5, 0xe1,
+ 0xa8, 0xb8, 0x2f, 0x5b, 0x88, 0x33, 0x42, 0x72,
+ 0x9e, 0x8a, 0xe0, 0x9d, 0x16, 0x93, 0x88, 0x41,
+ 0xa2, 0x1a, 0x97, 0xfb, 0x54, 0x3e, 0xea, 0x3b,
+ 0xbf, 0xf5, 0x9f, 0x13, 0xc1, 0xa1, 0x84, 0x49,
+ 0xe3, 0x98, 0x70, 0x1c, 0x1a, 0xd5, 0x16, 0x48,
+ 0x34, 0x6c, 0xbc, 0x04, 0xc2, 0x7b, 0xb2, 0xda,
+ 0x3b, 0x93, 0xa1, 0x37, 0x2c, 0xca, 0xe5, 0x48,
+ 0xfb, 0x53, 0xbe, 0xe4, 0x76, 0xf9, 0xe9, 0xc9,
+ 0x17, 0x73, 0xb1, 0xbb, 0x19, 0x82, 0x83, 0x94,
+ 0xd5, 0x5d, 0x3e, 0x1a, 0x20, 0xed, 0x69, 0x11,
+ 0x3a, 0x86, 0x0b, 0x68, 0x29, 0xff, 0xa8, 0x47,
+ 0x22, 0x46, 0x04, 0x43, 0x50, 0x70, 0x22, 0x1b,
+ 0x25, 0x7e, 0x8d, 0xff, 0x78, 0x36, 0x15, 0xd2,
+ 0xca, 0xe4, 0x80, 0x3a, 0x93, 0xaa, 0x43, 0x34,
+ 0xab, 0x48, 0x2a, 0x0a, 0xfa, 0xc9, 0xc0, 0xae,
+ 0xda, 0x70, 0xb4, 0x5a, 0x48, 0x1d, 0xf5, 0xde,
+ 0xc5, 0xdf, 0x8c, 0xc0, 0xf4, 0x23, 0xc7, 0x7a,
+ 0x5f, 0xd4, 0x6c, 0xd3, 0x12, 0x02, 0x1d, 0x4b,
+ 0x43, 0x88, 0x62, 0x41, 0x9a, 0x79, 0x1b, 0xe0,
+ 0x3b, 0xb4, 0xd9, 0x7c, 0x0e, 0x59, 0x57, 0x85,
+ 0x42, 0x53, 0x1b, 0xa4, 0x66, 0xa8, 0x3b, 0xaf,
+ 0x92, 0xce, 0xfc, 0x15, 0x1b, 0x5c, 0xc1, 0x61,
+ 0x1a, 0x16, 0x78, 0x93, 0x81, 0x9b, 0x63, 0xfb,
+ 0x8a, 0x6b, 0x18, 0xe8, 0x6d, 0xe6, 0x02, 0x90,
+ 0xfa, 0x72, 0xb7, 0x97, 0xb0, 0xce, 0x59, 0xf3
+};
+
+// Define vector of structs, with pointers to the statically defined vectors
+
+struct xts_vector vlist[NVEC] = {
+
+ // pointers to the statically defined vectors here
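+ // Each entry is { length in bytes, key1, key2, initial tweak, plaintext, expected ciphertext }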
+
+ // Vector 1
+ {sizeof(v1_CTX), v1_key1, v1_key2, v1_TW, v1_PTX, v1_CTX},
+ // Vector 2
+ {sizeof(v2_CTX), v2_key1, v2_key2, v2_TW, v2_PTX, v2_CTX},
+ // Vector 3
+ {sizeof(v3_CTX), v3_key1, v3_key2, v3_TW, v3_PTX, v3_CTX},
+ // Vector 4
+ {sizeof(v4_CTX), v4_key1, v4_key2, v4_TW, v4_PTX, v4_CTX},
+ // Vector 5
+ {sizeof(v5_CTX), v5_key1, v5_key2, v5_TW, v5_PTX, v5_CTX},
+ // Vector 6
+ {sizeof(v6_CTX), v6_key1, v6_key2, v6_TW, v6_PTX, v6_CTX},
+ // Vector 7
+ {sizeof(v7_CTX), v7_key1, v7_key2, v7_TW, v7_PTX, v7_CTX},
+ // Vector 8
+ {sizeof(v8_CTX), v8_key1, v8_key2, v8_TW, v8_PTX, v8_CTX},
+ // Vector 9
+ {sizeof(v9_CTX), v9_key1, v9_key2, v9_TW, v9_PTX, v9_CTX},
+ // Vector 15
+ {sizeof(v15_CTX), v15_key1, v15_key2, v15_TW, v15_PTX, v15_CTX},
+ // Vector 16
+ {sizeof(v16_CTX), v16_key1, v16_key2, v16_TW, v16_PTX, v16_CTX},
+ // Vector 17
+ {sizeof(v17_CTX), v17_key1, v17_key2, v17_TW, v17_PTX, v17_CTX},
+ // Vector 18
+ {sizeof(v18_CTX), v18_key1, v18_key2, v18_TW, v18_PTX, v18_CTX},
+ // Vector 19
+ {sizeof(v19_CTX), v19_key1, v19_key2, v19_TW, v19_PTX, v19_CTX}
+};
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c
new file mode 100644
index 000000000..5bccd4a5c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c
@@ -0,0 +1,145 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+#include <openssl/evp.h>
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
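+// Fill two 32-byte random keys, a 16-byte random tweak and n random plaintext bytes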
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+ unsigned char *p, int n)
+{
+ int i;
+ for (i = 0; i < 32; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ }
+ for (i = 0; i < 16; i++)
+ *t++ = rand();
+
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
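+// One-shot AES-256-XTS decrypt of the whole buffer via the OpenSSL EVP interface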
+static inline
+ int openssl_aes_256_xts_dec(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+ int len, unsigned char *pt, unsigned char *ct)
+{
+ int outlen, tmplen;
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv))
+ printf("\n ERROR!! \n");
+ if (!EVP_DecryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len))
+ printf("\n ERROR!! \n");
+ if (!EVP_DecryptFinal_ex(ctx, ct + outlen, &tmplen))
+ printf("\n ERROR!! \n");
+
+ return 0;
+}
+
+int main(void)
+{
+ int i;
+
+ unsigned char key1[16 * 2], key2[16 * 2], tinit[16];
+ unsigned char *pt, *ct, *dt, *refdt;
+ struct perf start, stop;
+ unsigned char keyssl[64]; /* SSL takes both keys together */
+
+ /* Initialise our cipher context, which can use the same input vectors */
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ printf("aes_xts_256_dec_perf:\n");
+
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+ refdt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt || NULL == refdt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ /* Set up key for the SSL engine */
+ for (i = 0; i < 32; i++) {
+ keyssl[i] = key1[i];
+ keyssl[i + 32] = key2[i];
+ }
+
+ /* Encrypt and compare decrypted output */
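+ /* Note: the XTS calls take the tweak key (key2) first, then the data key (key1) */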
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ openssl_aes_256_xts_dec(ctx, keyssl, tinit, TEST_LEN, ct, refdt);
+ if (memcmp(dt, refdt, TEST_LEN)) {
+ printf("ISA-L and OpenSSL results don't match\n");
+ return -1;
+ }
+
+ /* Time ISA-L decryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ perf_stop(&stop);
+ printf("aes_xts_256_dec" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ /* Time OpenSSL decryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ openssl_aes_256_xts_dec(ctx, keyssl, tinit, TEST_LEN, ct, refdt);
+ perf_stop(&stop);
+ printf("aes_xts_256_openssl_dec" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ EVP_CIPHER_CTX_free(ctx);
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_perf.c
new file mode 100644
index 000000000..ff3d62e93
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_perf.c
@@ -0,0 +1,126 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "aes_keyexp.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 3000000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 400
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+ unsigned char *p, int n)
+{
+ int i;
+ for (i = 0; i < 32; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ }
+ for (i = 0; i < 16; i++)
+ *t++ = rand();
+
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+int main(void)
+{
+ int i;
+
+ unsigned char key1[16 * 2], key2[16 * 2], tinit[16];
+ unsigned char *pt, *ct, *dt;
+ uint8_t expkey1_enc[16 * 15], expkey2_enc[16 * 15];
+ uint8_t expkey1_dec[16 * 15], null_key[16 * 15];
+
+ printf("aes_xts_256_dec_perf:\n");
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ /* Decode perf test */
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+
+ struct perf start, stop;
+
+ perf_start(&start);
+
+ for (i = 0; i < TEST_LOOPS; i++) {
+ XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ }
+
+ perf_stop(&stop);
+
+ printf("aes_xts_256_dec" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ /* Expanded keys perf test */
+
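+ // Only key2's encryption schedule is needed (tweak side); its decrypt schedule lands in the dummy null_key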
+ aes_keyexp_256(key1, expkey1_enc, expkey1_dec);
+ aes_keyexp_256(key2, expkey2_enc, null_key);
+ XTS_AES_256_dec_expanded_key(expkey2_enc, expkey1_dec, tinit, TEST_LEN, ct, pt);
+
+ perf_start(&start);
+
+ for (i = 0; i < TEST_LOOPS; i++) {
+ XTS_AES_256_dec_expanded_key(expkey2_enc, expkey1_dec, tinit, TEST_LEN, ct,
+ pt);
+ }
+
+ perf_stop(&stop);
+
+ printf("aes_xts_256_dec_expanded_key" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c
new file mode 100644
index 000000000..8d477ca89
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c
@@ -0,0 +1,145 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+#include <openssl/evp.h>
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+ unsigned char *p, int n)
+{
+ int i;
+ for (i = 0; i < 32; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ }
+ for (i = 0; i < 16; i++)
+ *t++ = rand();
+
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+static inline
+ int openssl_aes_256_xts_enc(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+ int len, unsigned char *pt, unsigned char *ct)
+{
+ int outlen, tmplen;
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv))
+ printf("\n ERROR!! \n");
+ if (!EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len))
+ printf("\n ERROR!! \n");
+ if (!EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen))
+ printf("\n ERROR!! \n");
+
+ return 0;
+}
+
+int main(void)
+{
+ int i;
+ unsigned char key1[16 * 2], key2[16 * 2], tinit[16];
+ unsigned char *pt, *ct, *refct;
+ struct perf start, stop;
+ unsigned char keyssl[64]; /* SSL takes both keys together */
+
+ /* Initialise our cipher context, which can use the same input vectors */
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ printf("aes_xts_256_enc_perf:\n");
+
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ refct = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == refct) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+
+ /* Set up key for the SSL engine */
+ for (i = 0; i < 32; i++) {
+ keyssl[i] = key1[i];
+ keyssl[i + 32] = key2[i];
+ }
+
+ /* Encrypt and compare output */
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ openssl_aes_256_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct);
+ if (memcmp(ct, refct, TEST_LEN)) {
+ printf("ISA-L and OpenSSL results don't match\n");
+ return -1;
+ }
+
+ /* Time ISA-L encryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ perf_stop(&stop);
+
+ printf("aes_xts_256_enc" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ /* Time OpenSSL encryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ openssl_aes_256_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct);
+ perf_stop(&stop);
+
+ printf("aes_xts_256_ossl_enc" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ EVP_CIPHER_CTX_free(ctx);
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_perf.c
new file mode 100644
index 000000000..051dd0a0e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_perf.c
@@ -0,0 +1,124 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "aes_keyexp.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 3000000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 400
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+ unsigned char *p, int n)
+{
+ int i;
+ for (i = 0; i < 32; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ }
+ for (i = 0; i < 16; i++)
+ *t++ = rand();
+
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+int main(void)
+{
+ int i;
+
+ unsigned char key1[16 * 2], key2[16 * 2], tinit[16];
+ unsigned char *pt, *ct;
+ uint8_t expkey1_enc[16 * 15], expkey2_enc[16 * 15];
+ uint8_t expkey1_dec[16 * 15], null_key[16 * 15];
+
+ printf("aes_xts_256_enc_perf:\n");
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ /* Encode perf test */
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+
+ struct perf start, stop;
+
+ perf_start(&start);
+
+ for (i = 0; i < TEST_LOOPS; i++) {
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ }
+
+ perf_stop(&stop);
+
+ printf("aes_xts_256_enc" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ /* Expanded keys perf test */
+
+ aes_keyexp_256(key1, expkey1_enc, expkey1_dec);
+ aes_keyexp_256(key2, expkey2_enc, null_key);
+ XTS_AES_256_enc_expanded_key(expkey2_enc, expkey1_enc, tinit, TEST_LEN, pt, ct);
+
+ perf_start(&start);
+
+ for (i = 0; i < TEST_LOOPS; i++) {
+ XTS_AES_256_enc_expanded_key(expkey2_enc, expkey1_enc, tinit, TEST_LEN, pt,
+ ct);
+ }
+
+ perf_stop(&stop);
+
+ printf("aes_xts_256_enc_expanded_key" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_expanded_key_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_expanded_key_test.c
new file mode 100644
index 000000000..c8d664a8b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_expanded_key_test.c
@@ -0,0 +1,113 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <aes_keyexp.h>
+#include "xts_256_vect.h"
+
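+// Runs the reference vectors from xts_256_vect.h through the expanded-key encrypt and decrypt paths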
+int main(void)
+{
+
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test;
+ uint8_t *pt_test;
+ // Arrays for expanded keys, null_key is a dummy vector (decrypt key not
+ // needed for the tweak part of the decryption)
+ uint8_t expkey1_enc[16 * 15], expkey2_enc[16 * 15];
+ uint8_t expkey1_dec[16 * 15], null_key[16 * 15];
+
+ int i, j;
+
+ // --- Encryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vlist[i].ptlen);
+ if (ct_test == NULL) {
+ printf("Can't allocate ciphertext memory\n");
+ return -1;
+ }
+ // Pre-expand our keys (will only use the encryption ones here)
+ aes_keyexp_256(vlist[i].key1, expkey1_enc, expkey1_dec);
+ aes_keyexp_256(vlist[i].key2, expkey2_enc, null_key);
+
+ XTS_AES_256_enc_expanded_key(expkey2_enc, expkey1_enc, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].PTX, ct_test);
+
+ // Carry out comparison of the calculated ciphertext with
+ // the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (ct_test[j] != vlist[i].CTX[j]) {
+ printf("\nXTS_AES_256_enc: Vector %d: ", i + 10);
+ printf("failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+ }
+
+ // --- Decryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+ // Allocate space for the calculated plaintext
+ pt_test = malloc(vlist[i].ptlen);
+ if (pt_test == NULL) {
+ printf("Can't allocate plaintext memory\n");
+ return -1;
+ }
+ // Pre-expand keys for the decryption
+ aes_keyexp_256(vlist[i].key1, expkey1_enc, expkey1_dec);
+ aes_keyexp_256(vlist[i].key2, expkey2_enc, null_key);
+
+ // Note, encryption key is re-used for the tweak decryption step
+ XTS_AES_256_dec_expanded_key(expkey2_enc, expkey1_dec, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].CTX, pt_test);
+
+ // Carry out comparison of the calculated plaintext with
+ // the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (pt_test[j] != vlist[i].PTX[j]) {
+ printf("\nXTS_AES_256_dec: Vector %d: ", i + 10);
+ printf("failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+ }
+ printf("Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c
new file mode 100644
index 000000000..5ad7359cc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c
@@ -0,0 +1,249 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include <aes_xts.h>
+#include <aes_keyexp.h>
+
+#define TEST_LEN (1024*1024)
+#define TEST_SIZE (4096)
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+ unsigned char *p, int n)
+{
+ int i;
+ for (i = 0; i < 32; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ }
+ for (i = 0; i < 16; i++)
+ *t++ = rand();
+
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
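+// Round-trips random keys, tweaks and message sizes through enc/dec (plain and expanded-key), including end-of-buffer cases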
+int main(void)
+{
+ int t, n;
+
+ unsigned char key1[16 * 2], key2[16 * 2], tinit[16];
+ unsigned char *pt, *ct, *dt;
+
+ int align, size, min_size;
+ unsigned char *efence_pt;
+ unsigned char *efence_ct;
+ unsigned char *efence_dt;
+
+ unsigned char *origin_pt;
+ unsigned char *origin_ct;
+ unsigned char *origin_dt;
+
+ unsigned char key1_exp_enc[16 * 15], key1_exp_dec[16 * 15];
+ unsigned char key2_exp_tw[16 * 15];
+ int i;
+ printf("aes_xts_256 enc/dec rand test, %d sets of %d max: ", RANDOMS, TEST_LEN);
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+
+ if (memcmp(pt, dt, TEST_LEN)) {
+ printf("fail\n");
+ return -1;
+ }
+ putchar('.');
+
+ // Do tests with random data, keys and message size
+ for (t = 0; t < RANDOMS; t++) {
+ n = rand() % (TEST_LEN);
+ if (n < 17)
+ continue;
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, n);
+ XTS_AES_256_enc(key2, key1, tinit, n, pt, ct);
+ XTS_AES_256_dec(key2, key1, tinit, n, ct, dt);
+
+ if (memcmp(pt, dt, n)) {
+ printf("fail rand %d, size %d\n", t, n);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ // Run tests at end of buffer for Electric Fence
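+ // (data is placed flush against the end of the allocation so any overrun would fault under efence)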
+ align = 1;
+ min_size = 16;
+ for (size = 0; size <= TEST_SIZE - min_size; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ XTS_AES_256_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_256_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) {
+ printf("efence: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ origin_pt = malloc(TEST_LEN);
+ origin_ct = malloc(TEST_LEN);
+ origin_dt = malloc(TEST_LEN);
+ if (NULL == origin_pt || NULL == origin_ct || NULL == origin_dt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+ // For data lengths from 0 to 15 bytes, the functions return without any error
+ // codes, without reading or writing any data.
+ for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ memcpy(efence_ct, efence_pt, TEST_SIZE - size);
+ memcpy(efence_dt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_pt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_ct, efence_ct, TEST_SIZE - size);
+ memcpy(origin_dt, efence_dt, TEST_SIZE - size);
+
+ XTS_AES_256_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_256_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) {
+ printf("efence_pt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) {
+ printf("efence_ct: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) {
+ printf("efence_dt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ for (i = 0; i < 16 * 15; i++) {
+ key2_exp_tw[i] = rand();
+ }
+
+ for (size = 0; size <= TEST_SIZE - min_size; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ aes_keyexp_256(key1, key1_exp_enc, key1_exp_dec);
+
+ XTS_AES_256_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit,
+ TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_256_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit,
+ TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) {
+ printf("efence_expanded_key: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ // For data lengths from 0 to 15 bytes, the functions return without any error
+ // codes, without reading or writing any data.
+ for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ memcpy(efence_ct, efence_pt, TEST_SIZE - size);
+ memcpy(efence_dt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_pt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_ct, efence_ct, TEST_SIZE - size);
+ memcpy(origin_dt, efence_dt, TEST_SIZE - size);
+
+ aes_keyexp_256(key1, key1_exp_enc, key1_exp_dec);
+
+ XTS_AES_256_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit,
+ TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_256_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit,
+ TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) {
+ printf("efence_expanded_key for pt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) {
+ printf("efence_expanded_key for ct: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) {
+ printf("efence_expanded_key for dt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+
+ putchar('.');
+ fflush(0);
+ }
+
+ printf("Pass\n");
+
+ return 0;
+}
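The harness above drives the one-shot isa-l XTS API over random buffers and at the end of the allocation. For orientation, here is a minimal round-trip sketch; it is not part of the imported sources and assumes nothing beyond the XTS_AES_256_enc/XTS_AES_256_dec calls and argument order used above (key2 first, then key1, the initial tweak, the byte count, input and output buffers).

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <aes_xts.h>

int main(void)
{
	unsigned char key1[32], key2[32], tweak[16];
	unsigned char pt[4096], ct[4096], dt[4096];
	int i;

	/* Random keys, tweak and plaintext, as xts256_mk_rand_data() does */
	for (i = 0; i < 32; i++) {
		key1[i] = rand();
		key2[i] = rand();
	}
	for (i = 0; i < 16; i++)
		tweak[i] = rand();
	for (i = 0; i < (int)sizeof(pt); i++)
		pt[i] = rand();

	/* One-shot encrypt then decrypt; argument order matches the calls above */
	XTS_AES_256_enc(key2, key1, tweak, sizeof(pt), pt, ct);
	XTS_AES_256_dec(key2, key1, tweak, sizeof(pt), ct, dt);

	if (memcmp(pt, dt, sizeof(pt)) != 0) {
		printf("round trip failed\n");
		return -1;
	}
	printf("round trip ok\n");
	return 0;
}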
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand_ossl_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand_ossl_test.c
new file mode 100644
index 000000000..6b25277dc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand_ossl_test.c
@@ -0,0 +1,273 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aes_xts.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/evp.h>
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#ifndef RANDOMS
+# define RANDOMS 128
+#endif
+#define TEST_LOOPS 128
+#define TEST_LEN (1024*1024)
+#define LENGTH_SCAN (2*1024)
+
+/* Generates random data for keys, tweak and plaintext */
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+ unsigned char *p, int n)
+{
+ int i;
+ for (i = 0; i < 32; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ }
+ for (i = 0; i < 16; i++)
+ *t++ = rand();
+
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+/* Wrapper for OpenSSL EVP AES-XTS 256 encryption */
+static inline int openssl_aes_256_xts_enc(EVP_CIPHER_CTX *ctx, unsigned char *key,
+ unsigned char *iv, int len, unsigned char *pt, unsigned char *ct)
+{
+ int outlen, tmplen;
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv)
+ || (!EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len))
+ || (!EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen))) {
+ printf("\n Error in openssl encoding of %d bytes\n", len);
+ return 1;
+ }
+ return 0;
+}
+
+/* Wrapper for OpenSSL EVP AES-XTS 256 decryption */
+static inline int openssl_aes_256_xts_dec(EVP_CIPHER_CTX *ctx, unsigned char *key,
+ unsigned char *iv, int len, unsigned char *ct, unsigned char *dt)
+{
+ int outlen, tmplen;
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv)
+ || (!EVP_DecryptUpdate(ctx, dt, &outlen, (const unsigned char *)ct, len))
+ || (!EVP_DecryptFinal_ex(ctx, dt + outlen, &tmplen))) {
+ printf("\n Error in openssl decoding of %d bytes\n", len);
+ return 1;
+ }
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+
+ unsigned char key1[32], key2[32], tinit[16];
+ unsigned char *pt, *ct, *dt, *refct, *refdt;
+ unsigned char keyssl[64]; /* SSL takes both keys together */
+ int i, j, k, ret;
+ int seed;
+
+ if (argc == 1)
+ seed = TEST_SEED;
+ else
+ seed = atoi(argv[1]);
+
+ srand(seed);
+ printf("SEED: %d\n", seed);
+
+ /* Initialise the cipher context; it is reused for every test vector below */
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ /* Allocate space for input and output buffers */
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+ refct = malloc(TEST_LEN);
+ refdt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt || NULL == refct || NULL == refdt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ /**************************** LENGTH SCAN TEST *************************/
+ printf("aes_xts_256_rand_ossl test, %d sets of various length: ", 2 * 1024);
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+
+ /* Set up key for the SSL engine */
+ for (k = 0; k < 32; k++) {
+ keyssl[k] = key1[k];
+ keyssl[k + 32] = key2[k];
+ }
+
+ for (ret = 0, i = 16; ret == 0 && i < LENGTH_SCAN; i++) {
+
+ /* Encrypt using each method */
+ XTS_AES_256_enc(key2, key1, tinit, i, pt, ct);
+ ret |= openssl_aes_256_xts_enc(ctx, keyssl, tinit, i, pt, refct);
+
+ // Compare
+ for (ret = 0, j = 0; j < i && ret == 0; j++) {
+ if (ct[j] != refct[j])
+ ret = 1;
+ }
+ if (ret)
+ printf(" XTS_AES_256_enc size=%d failed at byte %d!\n", i, j);
+
+ /* Decrypt using each method */
+ XTS_AES_256_dec(key2, key1, tinit, i, ct, dt);
+ ret |= openssl_aes_256_xts_dec(ctx, keyssl, tinit, i, refct, refdt);
+
+ for (j = 0; j < i && ret == 0; j++) {
+ if (dt[j] != refdt[j])
+ ret = 1;
+ }
+ if (ret)
+ printf(" XTS_AES_256_dec size=%d failed at byte %d!\n", i, j);
+ if (0 == i % (LENGTH_SCAN / 16))
+ printf(".");
+ fflush(0);
+ }
+ if (ret)
+ return -1;
+ printf("Pass\n");
+
+ /**************************** FIXED LENGTH TEST *************************/
+ printf("aes_xts_256_rand_ossl test, %d sets of length %d: ", TEST_LOOPS, TEST_LEN);
+
+ /* Loop over the vectors */
+ for (i = 0; i < TEST_LOOPS; i++) {
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+
+ /* Set up key for the SSL engine */
+ for (k = 0; k < 32; k++) {
+ keyssl[k] = key1[k];
+ keyssl[k + 32] = key2[k];
+ }
+
+ /* Encrypt using each method */
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ if (openssl_aes_256_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct))
+ return -1;
+
+ // Carry out comparison of the calculated ciphertext with
+ // the reference
+ for (j = 0; j < TEST_LEN; j++) {
+
+ if (ct[j] != refct[j]) {
+ printf("XTS_AES_256_enc failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+
+ /* Decrypt using each method */
+ XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ if (openssl_aes_256_xts_dec(ctx, keyssl, tinit, TEST_LEN, refct, refdt))
+ return -1;
+
+ for (j = 0; j < TEST_LEN; j++) {
+
+ if (dt[j] != refdt[j]) {
+ printf("XTS_AES_256_dec failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ if (0 == i % (TEST_LOOPS / 16))
+ printf(".");
+ fflush(0);
+ }
+ printf("Pass\n");
+
+ /**************************** RANDOM LENGTH TEST *************************/
+ printf("aes_xts_256_rand_ossl test, %d sets of random lengths: ", RANDOMS);
+
+ /* Run tests with random size */
+
+ unsigned int rand_len, t;
+
+ for (t = 0; t < RANDOMS; t++) {
+
+ rand_len = rand() % (TEST_LEN);
+ rand_len = rand_len < 16 ? 16 : rand_len;
+ xts256_mk_rand_data(key1, key2, tinit, pt, rand_len);
+
+ /* Set up key for the SSL engine */
+ for (k = 0; k < 32; k++) {
+ keyssl[k] = key1[k];
+ keyssl[k + 32] = key2[k];
+ }
+
+ /* Encrypt using each method */
+ XTS_AES_256_enc(key2, key1, tinit, rand_len, pt, ct);
+ if (openssl_aes_256_xts_enc(ctx, keyssl, tinit, rand_len, pt, refct))
+ return -1;
+
+ /* Carry out comparison of the calculated ciphertext with
+ * the reference
+ */
+ for (j = 0; j < rand_len; j++) {
+
+ if (ct[j] != refct[j]) {
+ printf("XTS_AES_256_enc failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+
+ /* Decrypt using each method */
+ XTS_AES_256_dec(key2, key1, tinit, rand_len, ct, dt);
+ if (openssl_aes_256_xts_dec(ctx, keyssl, tinit, rand_len, refct, refdt))
+ return -1;
+
+ for (j = 0; j < rand_len; j++) {
+
+ if (dt[j] != refdt[j]) {
+ printf("XTS_AES_256_dec failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ if (0 == t % (RANDOMS / 16))
+ printf(".");
+ fflush(0);
+ }
+
+ EVP_CIPHER_CTX_free(ctx);
+
+ printf("Pass\n");
+
+ printf("aes_xts_256_rand_ossl: All tests passed\n");
+
+ return 0;
+}
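One detail worth noting from the harness above: the isa-l entry points take the two 256-bit keys as separate arguments (key2 first, key1 second, as in the calls above), while OpenSSL's EVP_aes_256_xts() expects a single 64-byte key, which the keyssl[] loop builds by packing key1 followed by key2. The helper below is only a sketch of that mapping, not upstream code; EVP error handling and context creation are omitted for brevity.

#include <string.h>
#include <openssl/evp.h>
#include "aes_xts.h"

/* Encrypt len bytes with both implementations so the outputs can be compared. */
static void xts256_enc_both(EVP_CIPHER_CTX *ctx,
			    unsigned char key1[32], unsigned char key2[32],
			    unsigned char tweak[16], unsigned char *pt, int len,
			    unsigned char *ct_isal, unsigned char *ct_ossl)
{
	unsigned char keyssl[64];
	int outlen, tmplen;

	/* Same packing as the keyssl[] loop in main() above */
	memcpy(keyssl, key1, 32);
	memcpy(keyssl + 32, key2, 32);

	XTS_AES_256_enc(key2, key1, tweak, len, pt, ct_isal);

	EVP_EncryptInit_ex(ctx, EVP_aes_256_xts(), NULL, keyssl, tweak);
	EVP_EncryptUpdate(ctx, ct_ossl, &outlen, pt, len);
	EVP_EncryptFinal_ex(ctx, ct_ossl + outlen, &tmplen);
	/* For len >= 16, ct_isal and ct_ossl should be byte-for-byte equal */
}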
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c
new file mode 100644
index 000000000..2c961f44f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c
@@ -0,0 +1,105 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "xts_256_vect.h"
+
+int main(void)
+{
+
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test;
+ uint8_t *pt_test;
+
+ int i, j;
+
+ // --- Encryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vlist[i].ptlen);
+ if (ct_test == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return -1;
+ }
+
+ XTS_AES_256_enc(vlist[i].key2, vlist[i].key1, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].PTX, ct_test);
+
+ // Carry out comparison of the calculated ciphertext with
+ // the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (ct_test[j] != vlist[i].CTX[j]) {
+ printf("\nXTS_AES_256_enc: Vector %d: ", i + 10);
+ printf("failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+
+ free(ct_test);
+ ct_test = NULL;
+ }
+
+ // --- Decryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+ // Allocate space for the calculated plaintext
+ pt_test = malloc(vlist[i].ptlen);
+ if (pt_test == NULL) {
+ fprintf(stderr, "Can't allocate plaintext memory\n");
+ return -1;
+ }
+
+ XTS_AES_256_dec(vlist[i].key2, vlist[i].key1, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].CTX, pt_test);
+
+ // Carry out comparison of the calculated plaintext with
+ // the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (pt_test[j] != vlist[i].PTX[j]) {
+ printf("\nXTS_AES_256_dec: Vector %d: ", i + 10);
+ printf("failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+
+ free(pt_test);
+ pt_test = NULL;
+ }
+ printf("Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h
new file mode 100644
index 000000000..5a893f173
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h
@@ -0,0 +1,1035 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aes_xts.h"
+
+#define NVEC 5
+
+// struct to hold pointers to the key, plaintext and ciphertext vectors
+struct xts_vector {
+ uint64_t ptlen; // length of our plaintext
+ uint8_t *key1; // dimension 32 for 256 bit aes
+ uint8_t *key2; // dimension 32 for 256 bit aes
+ uint8_t *TW; // dimension 16 for both 128 and 256 bit
+ uint8_t *PTX; // min. dimension 16
+ uint8_t *CTX; // same dimension as PTX
+};
+
+/* Define our test vectors statically here. Test vectors are from the standard:
+ * "IEEE Standard for Cryptographic Protection of Data on Block-Oriented
+ * Storage Devices"
+ * http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4493450
+ *
+ * Vector 10
+ * Key1 2718281828459045235360287471352662497757247093699959574966967627
+ * Key2 3141592653589793238462643383279502884197169399375105820974944592
+ * Data Unit Sequence Number ff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX 1c3b3a102f770386e4836c99e370cf9bea00803f5e482357a4ae12d414a3e63b
+ * CTX 5d31e276f8fe4a8d66b317f9ac683f44680a86ac35adfc3345befecb4bb188fd
+ * CTX 5776926c49a3095eb108fd1098baec70aaa66999a72a82f27d848b21d4a741b0
+ * CTX c5cd4d5fff9dac89aeba122961d03a757123e9870f8acf1000020887891429ca
+ * CTX 2a3e7a7d7df7b10355165c8b9a6d0a7de8b062c4500dc4cd120c0f7418dae3d0
+ * CTX b5781c34803fa75421c790dfe1de1834f280d7667b327f6c8cd7557e12ac3a0f
+ * CTX 93ec05c52e0493ef31a12d3d9260f79a289d6a379bc70c50841473d1a8cc81ec
+ * CTX 583e9645e07b8d9670655ba5bbcfecc6dc3966380ad8fecb17b6ba02469a020a
+ * CTX 84e18e8f84252070c13e9f1f289be54fbc481457778f616015e1327a02b140f1
+ * CTX 505eb309326d68378f8374595c849d84f4c333ec4423885143cb47bd71c5edae
+ * CTX 9be69a2ffeceb1bec9de244fbe15992b11b77c040f12bd8f6a975a44a0f90c29
+ * CTX a9abc3d4d893927284c58754cce294529f8614dcd2aba991925fedc4ae74ffac
+ * CTX 6e333b93eb4aff0479da9a410e4450e0dd7ae4c6e2910900575da401fc07059f
+ * CTX 645e8b7e9bfdef33943054ff84011493c27b3429eaedb4ed5376441a77ed4385
+ * CTX 1ad77f16f541dfd269d50d6a5f14fb0aab1cbb4c1550be97f7ab4066193c4caa
+ * CTX 773dad38014bd2092fa755c824bb5e54c4f36ffda9fcea70b9c6e693e148c151
+ * Plaintext length (bytes): 512
+ */
+
+static uint8_t v10_key1[32] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26,
+ 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69,
+ 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27
+};
+
+static uint8_t v10_key2[32] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95,
+ 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37,
+ 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92
+};
+
+static uint8_t v10_TW[16] = {
+ 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v10_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v10_CTX[512] = {
+ 0x1c, 0x3b, 0x3a, 0x10, 0x2f, 0x77, 0x03, 0x86,
+ 0xe4, 0x83, 0x6c, 0x99, 0xe3, 0x70, 0xcf, 0x9b,
+ 0xea, 0x00, 0x80, 0x3f, 0x5e, 0x48, 0x23, 0x57,
+ 0xa4, 0xae, 0x12, 0xd4, 0x14, 0xa3, 0xe6, 0x3b,
+ 0x5d, 0x31, 0xe2, 0x76, 0xf8, 0xfe, 0x4a, 0x8d,
+ 0x66, 0xb3, 0x17, 0xf9, 0xac, 0x68, 0x3f, 0x44,
+ 0x68, 0x0a, 0x86, 0xac, 0x35, 0xad, 0xfc, 0x33,
+ 0x45, 0xbe, 0xfe, 0xcb, 0x4b, 0xb1, 0x88, 0xfd,
+ 0x57, 0x76, 0x92, 0x6c, 0x49, 0xa3, 0x09, 0x5e,
+ 0xb1, 0x08, 0xfd, 0x10, 0x98, 0xba, 0xec, 0x70,
+ 0xaa, 0xa6, 0x69, 0x99, 0xa7, 0x2a, 0x82, 0xf2,
+ 0x7d, 0x84, 0x8b, 0x21, 0xd4, 0xa7, 0x41, 0xb0,
+ 0xc5, 0xcd, 0x4d, 0x5f, 0xff, 0x9d, 0xac, 0x89,
+ 0xae, 0xba, 0x12, 0x29, 0x61, 0xd0, 0x3a, 0x75,
+ 0x71, 0x23, 0xe9, 0x87, 0x0f, 0x8a, 0xcf, 0x10,
+ 0x00, 0x02, 0x08, 0x87, 0x89, 0x14, 0x29, 0xca,
+ 0x2a, 0x3e, 0x7a, 0x7d, 0x7d, 0xf7, 0xb1, 0x03,
+ 0x55, 0x16, 0x5c, 0x8b, 0x9a, 0x6d, 0x0a, 0x7d,
+ 0xe8, 0xb0, 0x62, 0xc4, 0x50, 0x0d, 0xc4, 0xcd,
+ 0x12, 0x0c, 0x0f, 0x74, 0x18, 0xda, 0xe3, 0xd0,
+ 0xb5, 0x78, 0x1c, 0x34, 0x80, 0x3f, 0xa7, 0x54,
+ 0x21, 0xc7, 0x90, 0xdf, 0xe1, 0xde, 0x18, 0x34,
+ 0xf2, 0x80, 0xd7, 0x66, 0x7b, 0x32, 0x7f, 0x6c,
+ 0x8c, 0xd7, 0x55, 0x7e, 0x12, 0xac, 0x3a, 0x0f,
+ 0x93, 0xec, 0x05, 0xc5, 0x2e, 0x04, 0x93, 0xef,
+ 0x31, 0xa1, 0x2d, 0x3d, 0x92, 0x60, 0xf7, 0x9a,
+ 0x28, 0x9d, 0x6a, 0x37, 0x9b, 0xc7, 0x0c, 0x50,
+ 0x84, 0x14, 0x73, 0xd1, 0xa8, 0xcc, 0x81, 0xec,
+ 0x58, 0x3e, 0x96, 0x45, 0xe0, 0x7b, 0x8d, 0x96,
+ 0x70, 0x65, 0x5b, 0xa5, 0xbb, 0xcf, 0xec, 0xc6,
+ 0xdc, 0x39, 0x66, 0x38, 0x0a, 0xd8, 0xfe, 0xcb,
+ 0x17, 0xb6, 0xba, 0x02, 0x46, 0x9a, 0x02, 0x0a,
+ 0x84, 0xe1, 0x8e, 0x8f, 0x84, 0x25, 0x20, 0x70,
+ 0xc1, 0x3e, 0x9f, 0x1f, 0x28, 0x9b, 0xe5, 0x4f,
+ 0xbc, 0x48, 0x14, 0x57, 0x77, 0x8f, 0x61, 0x60,
+ 0x15, 0xe1, 0x32, 0x7a, 0x02, 0xb1, 0x40, 0xf1,
+ 0x50, 0x5e, 0xb3, 0x09, 0x32, 0x6d, 0x68, 0x37,
+ 0x8f, 0x83, 0x74, 0x59, 0x5c, 0x84, 0x9d, 0x84,
+ 0xf4, 0xc3, 0x33, 0xec, 0x44, 0x23, 0x88, 0x51,
+ 0x43, 0xcb, 0x47, 0xbd, 0x71, 0xc5, 0xed, 0xae,
+ 0x9b, 0xe6, 0x9a, 0x2f, 0xfe, 0xce, 0xb1, 0xbe,
+ 0xc9, 0xde, 0x24, 0x4f, 0xbe, 0x15, 0x99, 0x2b,
+ 0x11, 0xb7, 0x7c, 0x04, 0x0f, 0x12, 0xbd, 0x8f,
+ 0x6a, 0x97, 0x5a, 0x44, 0xa0, 0xf9, 0x0c, 0x29,
+ 0xa9, 0xab, 0xc3, 0xd4, 0xd8, 0x93, 0x92, 0x72,
+ 0x84, 0xc5, 0x87, 0x54, 0xcc, 0xe2, 0x94, 0x52,
+ 0x9f, 0x86, 0x14, 0xdc, 0xd2, 0xab, 0xa9, 0x91,
+ 0x92, 0x5f, 0xed, 0xc4, 0xae, 0x74, 0xff, 0xac,
+ 0x6e, 0x33, 0x3b, 0x93, 0xeb, 0x4a, 0xff, 0x04,
+ 0x79, 0xda, 0x9a, 0x41, 0x0e, 0x44, 0x50, 0xe0,
+ 0xdd, 0x7a, 0xe4, 0xc6, 0xe2, 0x91, 0x09, 0x00,
+ 0x57, 0x5d, 0xa4, 0x01, 0xfc, 0x07, 0x05, 0x9f,
+ 0x64, 0x5e, 0x8b, 0x7e, 0x9b, 0xfd, 0xef, 0x33,
+ 0x94, 0x30, 0x54, 0xff, 0x84, 0x01, 0x14, 0x93,
+ 0xc2, 0x7b, 0x34, 0x29, 0xea, 0xed, 0xb4, 0xed,
+ 0x53, 0x76, 0x44, 0x1a, 0x77, 0xed, 0x43, 0x85,
+ 0x1a, 0xd7, 0x7f, 0x16, 0xf5, 0x41, 0xdf, 0xd2,
+ 0x69, 0xd5, 0x0d, 0x6a, 0x5f, 0x14, 0xfb, 0x0a,
+ 0xab, 0x1c, 0xbb, 0x4c, 0x15, 0x50, 0xbe, 0x97,
+ 0xf7, 0xab, 0x40, 0x66, 0x19, 0x3c, 0x4c, 0xaa,
+ 0x77, 0x3d, 0xad, 0x38, 0x01, 0x4b, 0xd2, 0x09,
+ 0x2f, 0xa7, 0x55, 0xc8, 0x24, 0xbb, 0x5e, 0x54,
+ 0xc4, 0xf3, 0x6f, 0xfd, 0xa9, 0xfc, 0xea, 0x70,
+ 0xb9, 0xc6, 0xe6, 0x93, 0xe1, 0x48, 0xc1, 0x51
+};
+
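+/* Sketch (an assumption, not part of the standard vectors): the vlist[] table
+ * that xts_256_test.c iterates over, presumably defined at the end of this
+ * header, is expected to wire each vector's arrays into struct xts_vector.
+ * For Vector 10 such an entry would read
+ *
+ *   { 512, v10_key1, v10_key2, v10_TW, v10_PTX, v10_CTX }
+ *
+ * i.e. ptlen, key1, key2, TW, PTX, CTX in the field order declared above.
+ */
+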
+/*
+ * Vector 11
+ * Key1 2718281828459045235360287471352662497757247093699959574966967627
+ * Key2 3141592653589793238462643383279502884197169399375105820974944592
+ * Data Unit Sequence Number ffff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX 77a31251618a15e6b92d1d66dffe7b50b50bad552305ba0217a610688eff7e11
+ * CTX e1d0225438e093242d6db274fde801d4cae06f2092c728b2478559df58e837c2
+ * CTX 469ee4a4fa794e4bbc7f39bc026e3cb72c33b0888f25b4acf56a2a9804f1ce6d
+ * CTX 3d6e1dc6ca181d4b546179d55544aa7760c40d06741539c7e3cd9d2f6650b201
+ * CTX 3fd0eeb8c2b8e3d8d240ccae2d4c98320a7442e1c8d75a42d6e6cfa4c2eca179
+ * CTX 8d158c7aecdf82490f24bb9b38e108bcda12c3faf9a21141c3613b58367f922a
+ * CTX aa26cd22f23d708dae699ad7cb40a8ad0b6e2784973dcb605684c08b8d6998c6
+ * CTX 9aac049921871ebb65301a4619ca80ecb485a31d744223ce8ddc2394828d6a80
+ * CTX 470c092f5ba413c3378fa6054255c6f9df4495862bbb3287681f931b687c888a
+ * CTX bf844dfc8fc28331e579928cd12bd2390ae123cf03818d14dedde5c0c24c8ab0
+ * CTX 18bfca75ca096f2d531f3d1619e785f1ada437cab92e980558b3dce1474afb75
+ * CTX bfedbf8ff54cb2618e0244c9ac0d3c66fb51598cd2db11f9be39791abe447c63
+ * CTX 094f7c453b7ff87cb5bb36b7c79efb0872d17058b83b15ab0866ad8a58656c5a
+ * CTX 7e20dbdf308b2461d97c0ec0024a2715055249cf3b478ddd4740de654f75ca68
+ * CTX 6e0d7345c69ed50cdc2a8b332b1f8824108ac937eb050585608ee734097fc090
+ * CTX 54fbff89eeaeea791f4a7ab1f9868294a4f9e27b42af8100cb9d59cef9645803
+ * Plaintext length (bytes): 512
+ *
+*/
+static uint8_t v11_key1[32] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26,
+ 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69,
+ 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27
+};
+
+static uint8_t v11_key2[32] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95,
+ 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37,
+ 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92
+};
+
+static uint8_t v11_TW[16] = {
+ 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v11_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v11_CTX[512] = {
+ 0x77, 0xa3, 0x12, 0x51, 0x61, 0x8a, 0x15, 0xe6,
+ 0xb9, 0x2d, 0x1d, 0x66, 0xdf, 0xfe, 0x7b, 0x50,
+ 0xb5, 0x0b, 0xad, 0x55, 0x23, 0x05, 0xba, 0x02,
+ 0x17, 0xa6, 0x10, 0x68, 0x8e, 0xff, 0x7e, 0x11,
+ 0xe1, 0xd0, 0x22, 0x54, 0x38, 0xe0, 0x93, 0x24,
+ 0x2d, 0x6d, 0xb2, 0x74, 0xfd, 0xe8, 0x01, 0xd4,
+ 0xca, 0xe0, 0x6f, 0x20, 0x92, 0xc7, 0x28, 0xb2,
+ 0x47, 0x85, 0x59, 0xdf, 0x58, 0xe8, 0x37, 0xc2,
+ 0x46, 0x9e, 0xe4, 0xa4, 0xfa, 0x79, 0x4e, 0x4b,
+ 0xbc, 0x7f, 0x39, 0xbc, 0x02, 0x6e, 0x3c, 0xb7,
+ 0x2c, 0x33, 0xb0, 0x88, 0x8f, 0x25, 0xb4, 0xac,
+ 0xf5, 0x6a, 0x2a, 0x98, 0x04, 0xf1, 0xce, 0x6d,
+ 0x3d, 0x6e, 0x1d, 0xc6, 0xca, 0x18, 0x1d, 0x4b,
+ 0x54, 0x61, 0x79, 0xd5, 0x55, 0x44, 0xaa, 0x77,
+ 0x60, 0xc4, 0x0d, 0x06, 0x74, 0x15, 0x39, 0xc7,
+ 0xe3, 0xcd, 0x9d, 0x2f, 0x66, 0x50, 0xb2, 0x01,
+ 0x3f, 0xd0, 0xee, 0xb8, 0xc2, 0xb8, 0xe3, 0xd8,
+ 0xd2, 0x40, 0xcc, 0xae, 0x2d, 0x4c, 0x98, 0x32,
+ 0x0a, 0x74, 0x42, 0xe1, 0xc8, 0xd7, 0x5a, 0x42,
+ 0xd6, 0xe6, 0xcf, 0xa4, 0xc2, 0xec, 0xa1, 0x79,
+ 0x8d, 0x15, 0x8c, 0x7a, 0xec, 0xdf, 0x82, 0x49,
+ 0x0f, 0x24, 0xbb, 0x9b, 0x38, 0xe1, 0x08, 0xbc,
+ 0xda, 0x12, 0xc3, 0xfa, 0xf9, 0xa2, 0x11, 0x41,
+ 0xc3, 0x61, 0x3b, 0x58, 0x36, 0x7f, 0x92, 0x2a,
+ 0xaa, 0x26, 0xcd, 0x22, 0xf2, 0x3d, 0x70, 0x8d,
+ 0xae, 0x69, 0x9a, 0xd7, 0xcb, 0x40, 0xa8, 0xad,
+ 0x0b, 0x6e, 0x27, 0x84, 0x97, 0x3d, 0xcb, 0x60,
+ 0x56, 0x84, 0xc0, 0x8b, 0x8d, 0x69, 0x98, 0xc6,
+ 0x9a, 0xac, 0x04, 0x99, 0x21, 0x87, 0x1e, 0xbb,
+ 0x65, 0x30, 0x1a, 0x46, 0x19, 0xca, 0x80, 0xec,
+ 0xb4, 0x85, 0xa3, 0x1d, 0x74, 0x42, 0x23, 0xce,
+ 0x8d, 0xdc, 0x23, 0x94, 0x82, 0x8d, 0x6a, 0x80,
+ 0x47, 0x0c, 0x09, 0x2f, 0x5b, 0xa4, 0x13, 0xc3,
+ 0x37, 0x8f, 0xa6, 0x05, 0x42, 0x55, 0xc6, 0xf9,
+ 0xdf, 0x44, 0x95, 0x86, 0x2b, 0xbb, 0x32, 0x87,
+ 0x68, 0x1f, 0x93, 0x1b, 0x68, 0x7c, 0x88, 0x8a,
+ 0xbf, 0x84, 0x4d, 0xfc, 0x8f, 0xc2, 0x83, 0x31,
+ 0xe5, 0x79, 0x92, 0x8c, 0xd1, 0x2b, 0xd2, 0x39,
+ 0x0a, 0xe1, 0x23, 0xcf, 0x03, 0x81, 0x8d, 0x14,
+ 0xde, 0xdd, 0xe5, 0xc0, 0xc2, 0x4c, 0x8a, 0xb0,
+ 0x18, 0xbf, 0xca, 0x75, 0xca, 0x09, 0x6f, 0x2d,
+ 0x53, 0x1f, 0x3d, 0x16, 0x19, 0xe7, 0x85, 0xf1,
+ 0xad, 0xa4, 0x37, 0xca, 0xb9, 0x2e, 0x98, 0x05,
+ 0x58, 0xb3, 0xdc, 0xe1, 0x47, 0x4a, 0xfb, 0x75,
+ 0xbf, 0xed, 0xbf, 0x8f, 0xf5, 0x4c, 0xb2, 0x61,
+ 0x8e, 0x02, 0x44, 0xc9, 0xac, 0x0d, 0x3c, 0x66,
+ 0xfb, 0x51, 0x59, 0x8c, 0xd2, 0xdb, 0x11, 0xf9,
+ 0xbe, 0x39, 0x79, 0x1a, 0xbe, 0x44, 0x7c, 0x63,
+ 0x09, 0x4f, 0x7c, 0x45, 0x3b, 0x7f, 0xf8, 0x7c,
+ 0xb5, 0xbb, 0x36, 0xb7, 0xc7, 0x9e, 0xfb, 0x08,
+ 0x72, 0xd1, 0x70, 0x58, 0xb8, 0x3b, 0x15, 0xab,
+ 0x08, 0x66, 0xad, 0x8a, 0x58, 0x65, 0x6c, 0x5a,
+ 0x7e, 0x20, 0xdb, 0xdf, 0x30, 0x8b, 0x24, 0x61,
+ 0xd9, 0x7c, 0x0e, 0xc0, 0x02, 0x4a, 0x27, 0x15,
+ 0x05, 0x52, 0x49, 0xcf, 0x3b, 0x47, 0x8d, 0xdd,
+ 0x47, 0x40, 0xde, 0x65, 0x4f, 0x75, 0xca, 0x68,
+ 0x6e, 0x0d, 0x73, 0x45, 0xc6, 0x9e, 0xd5, 0x0c,
+ 0xdc, 0x2a, 0x8b, 0x33, 0x2b, 0x1f, 0x88, 0x24,
+ 0x10, 0x8a, 0xc9, 0x37, 0xeb, 0x05, 0x05, 0x85,
+ 0x60, 0x8e, 0xe7, 0x34, 0x09, 0x7f, 0xc0, 0x90,
+ 0x54, 0xfb, 0xff, 0x89, 0xee, 0xae, 0xea, 0x79,
+ 0x1f, 0x4a, 0x7a, 0xb1, 0xf9, 0x86, 0x82, 0x94,
+ 0xa4, 0xf9, 0xe2, 0x7b, 0x42, 0xaf, 0x81, 0x00,
+ 0xcb, 0x9d, 0x59, 0xce, 0xf9, 0x64, 0x58, 0x03
+};
+
+/*
+ * Vector 12
+ * Key1 2718281828459045235360287471352662497757247093699959574966967627
+ * Key2 3141592653589793238462643383279502884197169399375105820974944592
+ * Data Unit Sequence Number ffffff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX e387aaa58ba483afa7e8eb469778317ecf4cf573aa9d4eac23f2cdf914e4e200
+ * CTX a8b490e42ee646802dc6ee2b471b278195d60918ececb44bf79966f83faba049
+ * CTX 9298ebc699c0c8634715a320bb4f075d622e74c8c932004f25b41e361025b5a8
+ * CTX 7815391f6108fc4afa6a05d9303c6ba68a128a55705d415985832fdeaae6c8e1
+ * CTX 9110e84d1b1f199a2692119edc96132658f09da7c623efcec712537a3d94c0bf
+ * CTX 5d7e352ec94ae5797fdb377dc1551150721adf15bd26a8efc2fcaad56881fa9e
+ * CTX 62462c28f30ae1ceaca93c345cf243b73f542e2074a705bd2643bb9f7cc79bb6
+ * CTX e7091ea6e232df0f9ad0d6cf502327876d82207abf2115cdacf6d5a48f6c1879
+ * CTX a65b115f0f8b3cb3c59d15dd8c769bc014795a1837f3901b5845eb491adfefe0
+ * CTX 97b1fa30a12fc1f65ba22905031539971a10f2f36c321bb51331cdefb39e3964
+ * CTX c7ef079994f5b69b2edd83a71ef549971ee93f44eac3938fcdd61d01fa71799d
+ * CTX a3a8091c4c48aa9ed263ff0749df95d44fef6a0bb578ec69456aa5408ae32c7a
+ * CTX f08ad7ba8921287e3bbee31b767be06a0e705c864a769137df28292283ea81a2
+ * CTX 480241b44d9921cdbec1bc28dc1fda114bd8e5217ac9d8ebafa720e9da4f9ace
+ * CTX 231cc949e5b96fe76ffc21063fddc83a6b8679c00d35e09576a875305bed5f36
+ * CTX ed242c8900dd1fa965bc950dfce09b132263a1eef52dd6888c309f5a7d712826
+ * Plaintext length (bytes): 512
+*/
+
+static uint8_t v12_key1[32] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26,
+ 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69,
+ 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27
+};
+
+static uint8_t v12_key2[32] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95,
+ 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37,
+ 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92
+};
+
+static uint8_t v12_TW[16] = {
+ 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v12_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v12_CTX[512] = {
+ 0xe3, 0x87, 0xaa, 0xa5, 0x8b, 0xa4, 0x83, 0xaf,
+ 0xa7, 0xe8, 0xeb, 0x46, 0x97, 0x78, 0x31, 0x7e,
+ 0xcf, 0x4c, 0xf5, 0x73, 0xaa, 0x9d, 0x4e, 0xac,
+ 0x23, 0xf2, 0xcd, 0xf9, 0x14, 0xe4, 0xe2, 0x00,
+ 0xa8, 0xb4, 0x90, 0xe4, 0x2e, 0xe6, 0x46, 0x80,
+ 0x2d, 0xc6, 0xee, 0x2b, 0x47, 0x1b, 0x27, 0x81,
+ 0x95, 0xd6, 0x09, 0x18, 0xec, 0xec, 0xb4, 0x4b,
+ 0xf7, 0x99, 0x66, 0xf8, 0x3f, 0xab, 0xa0, 0x49,
+ 0x92, 0x98, 0xeb, 0xc6, 0x99, 0xc0, 0xc8, 0x63,
+ 0x47, 0x15, 0xa3, 0x20, 0xbb, 0x4f, 0x07, 0x5d,
+ 0x62, 0x2e, 0x74, 0xc8, 0xc9, 0x32, 0x00, 0x4f,
+ 0x25, 0xb4, 0x1e, 0x36, 0x10, 0x25, 0xb5, 0xa8,
+ 0x78, 0x15, 0x39, 0x1f, 0x61, 0x08, 0xfc, 0x4a,
+ 0xfa, 0x6a, 0x05, 0xd9, 0x30, 0x3c, 0x6b, 0xa6,
+ 0x8a, 0x12, 0x8a, 0x55, 0x70, 0x5d, 0x41, 0x59,
+ 0x85, 0x83, 0x2f, 0xde, 0xaa, 0xe6, 0xc8, 0xe1,
+ 0x91, 0x10, 0xe8, 0x4d, 0x1b, 0x1f, 0x19, 0x9a,
+ 0x26, 0x92, 0x11, 0x9e, 0xdc, 0x96, 0x13, 0x26,
+ 0x58, 0xf0, 0x9d, 0xa7, 0xc6, 0x23, 0xef, 0xce,
+ 0xc7, 0x12, 0x53, 0x7a, 0x3d, 0x94, 0xc0, 0xbf,
+ 0x5d, 0x7e, 0x35, 0x2e, 0xc9, 0x4a, 0xe5, 0x79,
+ 0x7f, 0xdb, 0x37, 0x7d, 0xc1, 0x55, 0x11, 0x50,
+ 0x72, 0x1a, 0xdf, 0x15, 0xbd, 0x26, 0xa8, 0xef,
+ 0xc2, 0xfc, 0xaa, 0xd5, 0x68, 0x81, 0xfa, 0x9e,
+ 0x62, 0x46, 0x2c, 0x28, 0xf3, 0x0a, 0xe1, 0xce,
+ 0xac, 0xa9, 0x3c, 0x34, 0x5c, 0xf2, 0x43, 0xb7,
+ 0x3f, 0x54, 0x2e, 0x20, 0x74, 0xa7, 0x05, 0xbd,
+ 0x26, 0x43, 0xbb, 0x9f, 0x7c, 0xc7, 0x9b, 0xb6,
+ 0xe7, 0x09, 0x1e, 0xa6, 0xe2, 0x32, 0xdf, 0x0f,
+ 0x9a, 0xd0, 0xd6, 0xcf, 0x50, 0x23, 0x27, 0x87,
+ 0x6d, 0x82, 0x20, 0x7a, 0xbf, 0x21, 0x15, 0xcd,
+ 0xac, 0xf6, 0xd5, 0xa4, 0x8f, 0x6c, 0x18, 0x79,
+ 0xa6, 0x5b, 0x11, 0x5f, 0x0f, 0x8b, 0x3c, 0xb3,
+ 0xc5, 0x9d, 0x15, 0xdd, 0x8c, 0x76, 0x9b, 0xc0,
+ 0x14, 0x79, 0x5a, 0x18, 0x37, 0xf3, 0x90, 0x1b,
+ 0x58, 0x45, 0xeb, 0x49, 0x1a, 0xdf, 0xef, 0xe0,
+ 0x97, 0xb1, 0xfa, 0x30, 0xa1, 0x2f, 0xc1, 0xf6,
+ 0x5b, 0xa2, 0x29, 0x05, 0x03, 0x15, 0x39, 0x97,
+ 0x1a, 0x10, 0xf2, 0xf3, 0x6c, 0x32, 0x1b, 0xb5,
+ 0x13, 0x31, 0xcd, 0xef, 0xb3, 0x9e, 0x39, 0x64,
+ 0xc7, 0xef, 0x07, 0x99, 0x94, 0xf5, 0xb6, 0x9b,
+ 0x2e, 0xdd, 0x83, 0xa7, 0x1e, 0xf5, 0x49, 0x97,
+ 0x1e, 0xe9, 0x3f, 0x44, 0xea, 0xc3, 0x93, 0x8f,
+ 0xcd, 0xd6, 0x1d, 0x01, 0xfa, 0x71, 0x79, 0x9d,
+ 0xa3, 0xa8, 0x09, 0x1c, 0x4c, 0x48, 0xaa, 0x9e,
+ 0xd2, 0x63, 0xff, 0x07, 0x49, 0xdf, 0x95, 0xd4,
+ 0x4f, 0xef, 0x6a, 0x0b, 0xb5, 0x78, 0xec, 0x69,
+ 0x45, 0x6a, 0xa5, 0x40, 0x8a, 0xe3, 0x2c, 0x7a,
+ 0xf0, 0x8a, 0xd7, 0xba, 0x89, 0x21, 0x28, 0x7e,
+ 0x3b, 0xbe, 0xe3, 0x1b, 0x76, 0x7b, 0xe0, 0x6a,
+ 0x0e, 0x70, 0x5c, 0x86, 0x4a, 0x76, 0x91, 0x37,
+ 0xdf, 0x28, 0x29, 0x22, 0x83, 0xea, 0x81, 0xa2,
+ 0x48, 0x02, 0x41, 0xb4, 0x4d, 0x99, 0x21, 0xcd,
+ 0xbe, 0xc1, 0xbc, 0x28, 0xdc, 0x1f, 0xda, 0x11,
+ 0x4b, 0xd8, 0xe5, 0x21, 0x7a, 0xc9, 0xd8, 0xeb,
+ 0xaf, 0xa7, 0x20, 0xe9, 0xda, 0x4f, 0x9a, 0xce,
+ 0x23, 0x1c, 0xc9, 0x49, 0xe5, 0xb9, 0x6f, 0xe7,
+ 0x6f, 0xfc, 0x21, 0x06, 0x3f, 0xdd, 0xc8, 0x3a,
+ 0x6b, 0x86, 0x79, 0xc0, 0x0d, 0x35, 0xe0, 0x95,
+ 0x76, 0xa8, 0x75, 0x30, 0x5b, 0xed, 0x5f, 0x36,
+ 0xed, 0x24, 0x2c, 0x89, 0x00, 0xdd, 0x1f, 0xa9,
+ 0x65, 0xbc, 0x95, 0x0d, 0xfc, 0xe0, 0x9b, 0x13,
+ 0x22, 0x63, 0xa1, 0xee, 0xf5, 0x2d, 0xd6, 0x88,
+ 0x8c, 0x30, 0x9f, 0x5a, 0x7d, 0x71, 0x28, 0x26
+};
+
+/*
+ * Vector 13
+ * Key1 2718281828459045235360287471352662497757247093699959574966967627
+ * Key2 3141592653589793238462643383279502884197169399375105820974944592
+ * Data Unit Sequence Number ffffffff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX bf53d2dade78e822a4d949a9bc6766b01b06a8ef70d26748c6a7fc36d80ae4c5
+ * CTX 520f7c4ab0ac8544424fa405162fef5a6b7f229498063618d39f0003cb5fb8d1
+ * CTX c86b643497da1ff945c8d3bedeca4f479702a7a735f043ddb1d6aaade3c4a0ac
+ * CTX 7ca7f3fa5279bef56f82cd7a2f38672e824814e10700300a055e1630b8f1cb0e
+ * CTX 919f5e942010a416e2bf48cb46993d3cb6a51c19bacf864785a00bc2ecff15d3
+ * CTX 50875b246ed53e68be6f55bd7e05cfc2b2ed6432198a6444b6d8c247fab941f5
+ * CTX 69768b5c429366f1d3f00f0345b96123d56204c01c63b22ce78baf116e525ed9
+ * CTX 0fdea39fa469494d3866c31e05f295ff21fea8d4e6e13d67e47ce722e9698a1c
+ * CTX 1048d68ebcde76b86fcf976eab8aa9790268b7068e017a8b9b749409514f1053
+ * CTX 027fd16c3786ea1bac5f15cb79711ee2abe82f5cf8b13ae73030ef5b9e4457e7
+ * CTX 5d1304f988d62dd6fc4b94ed38ba831da4b7634971b6cd8ec325d9c61c00f1df
+ * CTX 73627ed3745a5e8489f3a95c69639c32cd6e1d537a85f75cc844726e8a72fc00
+ * CTX 77ad22000f1d5078f6b866318c668f1ad03d5a5fced5219f2eabbd0aa5c0f460
+ * CTX d183f04404a0d6f469558e81fab24a167905ab4c7878502ad3e38fdbe62a4155
+ * CTX 6cec37325759533ce8f25f367c87bb5578d667ae93f9e2fd99bcbc5f2fbba88c
+ * CTX f6516139420fcff3b7361d86322c4bd84c82f335abb152c4a93411373aaa8220
+ * Plaintext length (bytes): 512
+*/
+
+static uint8_t v13_key1[32] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26,
+ 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69,
+ 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27
+};
+
+static uint8_t v13_key2[32] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95,
+ 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37,
+ 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92
+};
+
+static uint8_t v13_TW[16] = {
+ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v13_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v13_CTX[512] = {
+ 0xbf, 0x53, 0xd2, 0xda, 0xde, 0x78, 0xe8, 0x22,
+ 0xa4, 0xd9, 0x49, 0xa9, 0xbc, 0x67, 0x66, 0xb0,
+ 0x1b, 0x06, 0xa8, 0xef, 0x70, 0xd2, 0x67, 0x48,
+ 0xc6, 0xa7, 0xfc, 0x36, 0xd8, 0x0a, 0xe4, 0xc5,
+ 0x52, 0x0f, 0x7c, 0x4a, 0xb0, 0xac, 0x85, 0x44,
+ 0x42, 0x4f, 0xa4, 0x05, 0x16, 0x2f, 0xef, 0x5a,
+ 0x6b, 0x7f, 0x22, 0x94, 0x98, 0x06, 0x36, 0x18,
+ 0xd3, 0x9f, 0x00, 0x03, 0xcb, 0x5f, 0xb8, 0xd1,
+ 0xc8, 0x6b, 0x64, 0x34, 0x97, 0xda, 0x1f, 0xf9,
+ 0x45, 0xc8, 0xd3, 0xbe, 0xde, 0xca, 0x4f, 0x47,
+ 0x97, 0x02, 0xa7, 0xa7, 0x35, 0xf0, 0x43, 0xdd,
+ 0xb1, 0xd6, 0xaa, 0xad, 0xe3, 0xc4, 0xa0, 0xac,
+ 0x7c, 0xa7, 0xf3, 0xfa, 0x52, 0x79, 0xbe, 0xf5,
+ 0x6f, 0x82, 0xcd, 0x7a, 0x2f, 0x38, 0x67, 0x2e,
+ 0x82, 0x48, 0x14, 0xe1, 0x07, 0x00, 0x30, 0x0a,
+ 0x05, 0x5e, 0x16, 0x30, 0xb8, 0xf1, 0xcb, 0x0e,
+ 0x91, 0x9f, 0x5e, 0x94, 0x20, 0x10, 0xa4, 0x16,
+ 0xe2, 0xbf, 0x48, 0xcb, 0x46, 0x99, 0x3d, 0x3c,
+ 0xb6, 0xa5, 0x1c, 0x19, 0xba, 0xcf, 0x86, 0x47,
+ 0x85, 0xa0, 0x0b, 0xc2, 0xec, 0xff, 0x15, 0xd3,
+ 0x50, 0x87, 0x5b, 0x24, 0x6e, 0xd5, 0x3e, 0x68,
+ 0xbe, 0x6f, 0x55, 0xbd, 0x7e, 0x05, 0xcf, 0xc2,
+ 0xb2, 0xed, 0x64, 0x32, 0x19, 0x8a, 0x64, 0x44,
+ 0xb6, 0xd8, 0xc2, 0x47, 0xfa, 0xb9, 0x41, 0xf5,
+ 0x69, 0x76, 0x8b, 0x5c, 0x42, 0x93, 0x66, 0xf1,
+ 0xd3, 0xf0, 0x0f, 0x03, 0x45, 0xb9, 0x61, 0x23,
+ 0xd5, 0x62, 0x04, 0xc0, 0x1c, 0x63, 0xb2, 0x2c,
+ 0xe7, 0x8b, 0xaf, 0x11, 0x6e, 0x52, 0x5e, 0xd9,
+ 0x0f, 0xde, 0xa3, 0x9f, 0xa4, 0x69, 0x49, 0x4d,
+ 0x38, 0x66, 0xc3, 0x1e, 0x05, 0xf2, 0x95, 0xff,
+ 0x21, 0xfe, 0xa8, 0xd4, 0xe6, 0xe1, 0x3d, 0x67,
+ 0xe4, 0x7c, 0xe7, 0x22, 0xe9, 0x69, 0x8a, 0x1c,
+ 0x10, 0x48, 0xd6, 0x8e, 0xbc, 0xde, 0x76, 0xb8,
+ 0x6f, 0xcf, 0x97, 0x6e, 0xab, 0x8a, 0xa9, 0x79,
+ 0x02, 0x68, 0xb7, 0x06, 0x8e, 0x01, 0x7a, 0x8b,
+ 0x9b, 0x74, 0x94, 0x09, 0x51, 0x4f, 0x10, 0x53,
+ 0x02, 0x7f, 0xd1, 0x6c, 0x37, 0x86, 0xea, 0x1b,
+ 0xac, 0x5f, 0x15, 0xcb, 0x79, 0x71, 0x1e, 0xe2,
+ 0xab, 0xe8, 0x2f, 0x5c, 0xf8, 0xb1, 0x3a, 0xe7,
+ 0x30, 0x30, 0xef, 0x5b, 0x9e, 0x44, 0x57, 0xe7,
+ 0x5d, 0x13, 0x04, 0xf9, 0x88, 0xd6, 0x2d, 0xd6,
+ 0xfc, 0x4b, 0x94, 0xed, 0x38, 0xba, 0x83, 0x1d,
+ 0xa4, 0xb7, 0x63, 0x49, 0x71, 0xb6, 0xcd, 0x8e,
+ 0xc3, 0x25, 0xd9, 0xc6, 0x1c, 0x00, 0xf1, 0xdf,
+ 0x73, 0x62, 0x7e, 0xd3, 0x74, 0x5a, 0x5e, 0x84,
+ 0x89, 0xf3, 0xa9, 0x5c, 0x69, 0x63, 0x9c, 0x32,
+ 0xcd, 0x6e, 0x1d, 0x53, 0x7a, 0x85, 0xf7, 0x5c,
+ 0xc8, 0x44, 0x72, 0x6e, 0x8a, 0x72, 0xfc, 0x00,
+ 0x77, 0xad, 0x22, 0x00, 0x0f, 0x1d, 0x50, 0x78,
+ 0xf6, 0xb8, 0x66, 0x31, 0x8c, 0x66, 0x8f, 0x1a,
+ 0xd0, 0x3d, 0x5a, 0x5f, 0xce, 0xd5, 0x21, 0x9f,
+ 0x2e, 0xab, 0xbd, 0x0a, 0xa5, 0xc0, 0xf4, 0x60,
+ 0xd1, 0x83, 0xf0, 0x44, 0x04, 0xa0, 0xd6, 0xf4,
+ 0x69, 0x55, 0x8e, 0x81, 0xfa, 0xb2, 0x4a, 0x16,
+ 0x79, 0x05, 0xab, 0x4c, 0x78, 0x78, 0x50, 0x2a,
+ 0xd3, 0xe3, 0x8f, 0xdb, 0xe6, 0x2a, 0x41, 0x55,
+ 0x6c, 0xec, 0x37, 0x32, 0x57, 0x59, 0x53, 0x3c,
+ 0xe8, 0xf2, 0x5f, 0x36, 0x7c, 0x87, 0xbb, 0x55,
+ 0x78, 0xd6, 0x67, 0xae, 0x93, 0xf9, 0xe2, 0xfd,
+ 0x99, 0xbc, 0xbc, 0x5f, 0x2f, 0xbb, 0xa8, 0x8c,
+ 0xf6, 0x51, 0x61, 0x39, 0x42, 0x0f, 0xcf, 0xf3,
+ 0xb7, 0x36, 0x1d, 0x86, 0x32, 0x2c, 0x4b, 0xd8,
+ 0x4c, 0x82, 0xf3, 0x35, 0xab, 0xb1, 0x52, 0xc4,
+ 0xa9, 0x34, 0x11, 0x37, 0x3a, 0xaa, 0x82, 0x20
+};
+
+/*
+ * Vector 14
+ * Key1 2718281828459045235360287471352662497757247093699959574966967627
+ * Key2 3141592653589793238462643383279502884197169399375105820974944592
+ * Data Unit Sequence Number ffffffffff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX 64497e5a831e4a932c09be3e5393376daa599548b816031d224bbf50a818ed23
+ * CTX 50eae7e96087c8a0db51ad290bd00c1ac1620857635bf246c176ab463be30b80
+ * CTX 8da548081ac847b158e1264be25bb0910bbc92647108089415d45fab1b3d2604
+ * CTX e8a8eff1ae4020cfa39936b66827b23f371b92200be90251e6d73c5f86de5fd4
+ * CTX a950781933d79a28272b782a2ec313efdfcc0628f43d744c2dc2ff3dcb66999b
+ * CTX 50c7ca895b0c64791eeaa5f29499fb1c026f84ce5b5c72ba1083cddb5ce45434
+ * CTX 631665c333b60b11593fb253c5179a2c8db813782a004856a1653011e93fb6d8
+ * CTX 76c18366dd8683f53412c0c180f9c848592d593f8609ca736317d356e13e2bff
+ * CTX 3a9f59cd9aeb19cd482593d8c46128bb32423b37a9adfb482b99453fbe25a41b
+ * CTX f6feb4aa0bef5ed24bf73c762978025482c13115e4015aac992e5613a3b5c2f6
+ * CTX 85b84795cb6e9b2656d8c88157e52c42f978d8634c43d06fea928f2822e465aa
+ * CTX 6576e9bf419384506cc3ce3c54ac1a6f67dc66f3b30191e698380bc999b05abc
+ * CTX e19dc0c6dcc2dd001ec535ba18deb2df1a101023108318c75dc98611a09dc48a
+ * CTX 0acdec676fabdf222f07e026f059b672b56e5cbc8e1d21bbd867dd9272120546
+ * CTX 81d70ea737134cdfce93b6f82ae22423274e58a0821cc5502e2d0ab4585e94de
+ * CTX 6975be5e0b4efce51cd3e70c25a1fbbbd609d273ad5b0d59631c531f6a0a57b9
+ * Plaintext length (bytes): 512
+*/
+
+static uint8_t v14_key1[32] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26,
+ 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69,
+ 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27
+};
+
+static uint8_t v14_key2[32] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95,
+ 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37,
+ 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92
+};
+
+static uint8_t v14_TW[16] = {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v14_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v14_CTX[512] = {
+ 0x64, 0x49, 0x7e, 0x5a, 0x83, 0x1e, 0x4a, 0x93,
+ 0x2c, 0x09, 0xbe, 0x3e, 0x53, 0x93, 0x37, 0x6d,
+ 0xaa, 0x59, 0x95, 0x48, 0xb8, 0x16, 0x03, 0x1d,
+ 0x22, 0x4b, 0xbf, 0x50, 0xa8, 0x18, 0xed, 0x23,
+ 0x50, 0xea, 0xe7, 0xe9, 0x60, 0x87, 0xc8, 0xa0,
+ 0xdb, 0x51, 0xad, 0x29, 0x0b, 0xd0, 0x0c, 0x1a,
+ 0xc1, 0x62, 0x08, 0x57, 0x63, 0x5b, 0xf2, 0x46,
+ 0xc1, 0x76, 0xab, 0x46, 0x3b, 0xe3, 0x0b, 0x80,
+ 0x8d, 0xa5, 0x48, 0x08, 0x1a, 0xc8, 0x47, 0xb1,
+ 0x58, 0xe1, 0x26, 0x4b, 0xe2, 0x5b, 0xb0, 0x91,
+ 0x0b, 0xbc, 0x92, 0x64, 0x71, 0x08, 0x08, 0x94,
+ 0x15, 0xd4, 0x5f, 0xab, 0x1b, 0x3d, 0x26, 0x04,
+ 0xe8, 0xa8, 0xef, 0xf1, 0xae, 0x40, 0x20, 0xcf,
+ 0xa3, 0x99, 0x36, 0xb6, 0x68, 0x27, 0xb2, 0x3f,
+ 0x37, 0x1b, 0x92, 0x20, 0x0b, 0xe9, 0x02, 0x51,
+ 0xe6, 0xd7, 0x3c, 0x5f, 0x86, 0xde, 0x5f, 0xd4,
+ 0xa9, 0x50, 0x78, 0x19, 0x33, 0xd7, 0x9a, 0x28,
+ 0x27, 0x2b, 0x78, 0x2a, 0x2e, 0xc3, 0x13, 0xef,
+ 0xdf, 0xcc, 0x06, 0x28, 0xf4, 0x3d, 0x74, 0x4c,
+ 0x2d, 0xc2, 0xff, 0x3d, 0xcb, 0x66, 0x99, 0x9b,
+ 0x50, 0xc7, 0xca, 0x89, 0x5b, 0x0c, 0x64, 0x79,
+ 0x1e, 0xea, 0xa5, 0xf2, 0x94, 0x99, 0xfb, 0x1c,
+ 0x02, 0x6f, 0x84, 0xce, 0x5b, 0x5c, 0x72, 0xba,
+ 0x10, 0x83, 0xcd, 0xdb, 0x5c, 0xe4, 0x54, 0x34,
+ 0x63, 0x16, 0x65, 0xc3, 0x33, 0xb6, 0x0b, 0x11,
+ 0x59, 0x3f, 0xb2, 0x53, 0xc5, 0x17, 0x9a, 0x2c,
+ 0x8d, 0xb8, 0x13, 0x78, 0x2a, 0x00, 0x48, 0x56,
+ 0xa1, 0x65, 0x30, 0x11, 0xe9, 0x3f, 0xb6, 0xd8,
+ 0x76, 0xc1, 0x83, 0x66, 0xdd, 0x86, 0x83, 0xf5,
+ 0x34, 0x12, 0xc0, 0xc1, 0x80, 0xf9, 0xc8, 0x48,
+ 0x59, 0x2d, 0x59, 0x3f, 0x86, 0x09, 0xca, 0x73,
+ 0x63, 0x17, 0xd3, 0x56, 0xe1, 0x3e, 0x2b, 0xff,
+ 0x3a, 0x9f, 0x59, 0xcd, 0x9a, 0xeb, 0x19, 0xcd,
+ 0x48, 0x25, 0x93, 0xd8, 0xc4, 0x61, 0x28, 0xbb,
+ 0x32, 0x42, 0x3b, 0x37, 0xa9, 0xad, 0xfb, 0x48,
+ 0x2b, 0x99, 0x45, 0x3f, 0xbe, 0x25, 0xa4, 0x1b,
+ 0xf6, 0xfe, 0xb4, 0xaa, 0x0b, 0xef, 0x5e, 0xd2,
+ 0x4b, 0xf7, 0x3c, 0x76, 0x29, 0x78, 0x02, 0x54,
+ 0x82, 0xc1, 0x31, 0x15, 0xe4, 0x01, 0x5a, 0xac,
+ 0x99, 0x2e, 0x56, 0x13, 0xa3, 0xb5, 0xc2, 0xf6,
+ 0x85, 0xb8, 0x47, 0x95, 0xcb, 0x6e, 0x9b, 0x26,
+ 0x56, 0xd8, 0xc8, 0x81, 0x57, 0xe5, 0x2c, 0x42,
+ 0xf9, 0x78, 0xd8, 0x63, 0x4c, 0x43, 0xd0, 0x6f,
+ 0xea, 0x92, 0x8f, 0x28, 0x22, 0xe4, 0x65, 0xaa,
+ 0x65, 0x76, 0xe9, 0xbf, 0x41, 0x93, 0x84, 0x50,
+ 0x6c, 0xc3, 0xce, 0x3c, 0x54, 0xac, 0x1a, 0x6f,
+ 0x67, 0xdc, 0x66, 0xf3, 0xb3, 0x01, 0x91, 0xe6,
+ 0x98, 0x38, 0x0b, 0xc9, 0x99, 0xb0, 0x5a, 0xbc,
+ 0xe1, 0x9d, 0xc0, 0xc6, 0xdc, 0xc2, 0xdd, 0x00,
+ 0x1e, 0xc5, 0x35, 0xba, 0x18, 0xde, 0xb2, 0xdf,
+ 0x1a, 0x10, 0x10, 0x23, 0x10, 0x83, 0x18, 0xc7,
+ 0x5d, 0xc9, 0x86, 0x11, 0xa0, 0x9d, 0xc4, 0x8a,
+ 0x0a, 0xcd, 0xec, 0x67, 0x6f, 0xab, 0xdf, 0x22,
+ 0x2f, 0x07, 0xe0, 0x26, 0xf0, 0x59, 0xb6, 0x72,
+ 0xb5, 0x6e, 0x5c, 0xbc, 0x8e, 0x1d, 0x21, 0xbb,
+ 0xd8, 0x67, 0xdd, 0x92, 0x72, 0x12, 0x05, 0x46,
+ 0x81, 0xd7, 0x0e, 0xa7, 0x37, 0x13, 0x4c, 0xdf,
+ 0xce, 0x93, 0xb6, 0xf8, 0x2a, 0xe2, 0x24, 0x23,
+ 0x27, 0x4e, 0x58, 0xa0, 0x82, 0x1c, 0xc5, 0x50,
+ 0x2e, 0x2d, 0x0a, 0xb4, 0x58, 0x5e, 0x94, 0xde,
+ 0x69, 0x75, 0xbe, 0x5e, 0x0b, 0x4e, 0xfc, 0xe5,
+ 0x1c, 0xd3, 0xe7, 0x0c, 0x25, 0xa1, 0xfb, 0xbb,
+ 0xd6, 0x09, 0xd2, 0x73, 0xad, 0x5b, 0x0d, 0x59,
+ 0x63, 0x1c, 0x53, 0x1f, 0x6a, 0x0a, 0x57, 0xb9
+};
+
+//
+// Define vector of structs, with pointers to the statically defined vectors
+
+struct xts_vector vlist[NVEC] = {
+
+ // pointers to the statically defined vectors here
+
+ // Vector 10
+ {sizeof(v10_CTX), v10_key1, v10_key2, v10_TW, v10_PTX, v10_CTX}
+ ,
+ // Vector 11
+ {sizeof(v11_CTX), v11_key1, v11_key2, v11_TW, v11_PTX, v11_CTX}
+ ,
+ // Vector 12
+ {sizeof(v12_CTX), v12_key1, v12_key2, v12_TW, v12_PTX, v12_CTX}
+ ,
+ // Vector 13
+ {sizeof(v13_CTX), v13_key1, v13_key2, v13_TW, v13_PTX, v13_CTX}
+ ,
+ // Vector 14
+ {sizeof(v14_CTX), v14_key1, v14_key2, v14_TW, v14_PTX, v14_CTX}
+
+};
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_128_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_128_multibinary.asm
new file mode 100644
index 000000000..416da1e7b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_128_multibinary.asm
@@ -0,0 +1,78 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+[bits 64]
+
+%include "reg_sizes.asm"
+
+extern XTS_AES_128_enc_sse
+extern XTS_AES_128_enc_avx
+
+extern XTS_AES_128_enc_expanded_key_sse
+extern XTS_AES_128_enc_expanded_key_avx
+
+extern XTS_AES_128_dec_sse
+extern XTS_AES_128_dec_avx
+
+extern XTS_AES_128_dec_expanded_key_sse
+extern XTS_AES_128_dec_expanded_key_avx
+
+%if (AS_FEATURE_LEVEL) >= 10
+extern XTS_AES_128_enc_vaes
+extern XTS_AES_128_enc_expanded_key_vaes
+extern XTS_AES_128_dec_vaes
+extern XTS_AES_128_dec_expanded_key_vaes
+%endif
+
+section .text
+
+%include "multibinary.asm"
+
+;;;;
+; instantiate XTS_AES_128_enc, XTS_AES_128_enc_expanded_key, XTS_AES_128_dec, and XTS_AES_128_dec_expanded_key
+;;;;
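+; Runtime CPU dispatch: each interface below resolves to the SSE, AVX or (at
+; feature level >= 10) VAES implementation declared above, according to the
+; CPU features detected at run time.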
+mbin_interface XTS_AES_128_enc
+mbin_dispatch_init7 XTS_AES_128_enc, XTS_AES_128_enc_sse, XTS_AES_128_enc_sse, XTS_AES_128_enc_avx, XTS_AES_128_enc_avx, XTS_AES_128_enc_avx, XTS_AES_128_enc_vaes
+
+mbin_interface XTS_AES_128_enc_expanded_key
+mbin_dispatch_init7 XTS_AES_128_enc_expanded_key, XTS_AES_128_enc_expanded_key_sse, XTS_AES_128_enc_expanded_key_sse, XTS_AES_128_enc_expanded_key_avx, XTS_AES_128_enc_expanded_key_avx, XTS_AES_128_enc_expanded_key_avx, XTS_AES_128_enc_expanded_key_vaes
+
+mbin_interface XTS_AES_128_dec
+mbin_dispatch_init7 XTS_AES_128_dec, XTS_AES_128_dec_sse, XTS_AES_128_dec_sse, XTS_AES_128_dec_avx, XTS_AES_128_dec_avx, XTS_AES_128_dec_avx, XTS_AES_128_dec_vaes
+
+mbin_interface XTS_AES_128_dec_expanded_key
+mbin_dispatch_init7 XTS_AES_128_dec_expanded_key, XTS_AES_128_dec_expanded_key_sse, XTS_AES_128_dec_expanded_key_sse, XTS_AES_128_dec_expanded_key_avx, XTS_AES_128_dec_expanded_key_avx, XTS_AES_128_dec_expanded_key_avx, XTS_AES_128_dec_expanded_key_vaes
+
+
+;;; func core, ver, snum
+slversion XTS_AES_128_enc, 01, 04, 0071
+slversion XTS_AES_128_enc_expanded_key, 01, 04, 0072
+slversion XTS_AES_128_dec, 01, 04, 0073
+slversion XTS_AES_128_dec_expanded_key, 01, 04, 0074
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_256_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_256_multibinary.asm
new file mode 100644
index 000000000..33f376d5c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_256_multibinary.asm
@@ -0,0 +1,78 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+[bits 64]
+
+%include "reg_sizes.asm"
+
+extern XTS_AES_256_enc_sse
+extern XTS_AES_256_enc_avx
+
+extern XTS_AES_256_enc_expanded_key_sse
+extern XTS_AES_256_enc_expanded_key_avx
+
+extern XTS_AES_256_dec_sse
+extern XTS_AES_256_dec_avx
+
+extern XTS_AES_256_dec_expanded_key_sse
+extern XTS_AES_256_dec_expanded_key_avx
+
+%if (AS_FEATURE_LEVEL) >= 10
+extern XTS_AES_256_enc_vaes
+extern XTS_AES_256_enc_expanded_key_vaes
+extern XTS_AES_256_dec_vaes
+extern XTS_AES_256_dec_expanded_key_vaes
+%endif
+
+section .text
+
+%include "multibinary.asm"
+
+;;;;
+; instantiate XTS_AES_256_enc, XTS_AES_256_enc_expanded_key, XTS_AES_256_dec, and XTS_AES_256_dec_expanded_key
+;;;;
+mbin_interface XTS_AES_256_enc
+mbin_dispatch_init7 XTS_AES_256_enc, XTS_AES_256_enc_sse, XTS_AES_256_enc_sse, XTS_AES_256_enc_avx, XTS_AES_256_enc_avx, XTS_AES_256_enc_avx, XTS_AES_256_enc_vaes
+
+mbin_interface XTS_AES_256_enc_expanded_key
+mbin_dispatch_init7 XTS_AES_256_enc_expanded_key, XTS_AES_256_enc_expanded_key_sse, XTS_AES_256_enc_expanded_key_sse, XTS_AES_256_enc_expanded_key_avx, XTS_AES_256_enc_expanded_key_avx, XTS_AES_256_enc_expanded_key_avx, XTS_AES_256_enc_expanded_key_vaes
+
+mbin_interface XTS_AES_256_dec
+mbin_dispatch_init7 XTS_AES_256_dec, XTS_AES_256_dec_sse, XTS_AES_256_dec_sse, XTS_AES_256_dec_avx, XTS_AES_256_dec_avx, XTS_AES_256_dec_avx, XTS_AES_256_dec_vaes
+
+mbin_interface XTS_AES_256_dec_expanded_key
+mbin_dispatch_init7 XTS_AES_256_dec_expanded_key, XTS_AES_256_dec_expanded_key_sse, XTS_AES_256_dec_expanded_key_sse, XTS_AES_256_dec_expanded_key_avx, XTS_AES_256_dec_expanded_key_avx, XTS_AES_256_dec_expanded_key_avx, XTS_AES_256_dec_expanded_key_vaes
+
+
+;;; func core, ver, snum
+slversion XTS_AES_256_enc, 01, 04, 0076
+slversion XTS_AES_256_enc_expanded_key, 01, 04, 0077
+slversion XTS_AES_256_dec, 01, 04, 0078
+slversion XTS_AES_256_dec_expanded_key, 01, 04, 0079
diff --git a/src/crypto/isa-l/isa-l_crypto/autogen.sh b/src/crypto/isa-l/isa-l_crypto/autogen.sh
new file mode 100755
index 000000000..0a3189383
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/autogen.sh
@@ -0,0 +1,17 @@
+#!/bin/sh -e
+
+autoreconf --install --symlink -f
+
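+# Resolve the multilib OS library directory (e.g. /usr/lib64) under the given
+# prefix, using gcc -print-multi-os-directory.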
+libdir() {
+ echo $(cd $1/$(gcc -print-multi-os-directory); pwd)
+}
+
+args="--prefix=/usr --libdir=$(libdir /usr/lib)"
+
+echo
+echo "----------------------------------------------------------------"
+echo "Initialized build system. For a common configuration please run:"
+echo "----------------------------------------------------------------"
+echo
+echo "./configure $args"
+echo
diff --git a/src/crypto/isa-l/isa-l_crypto/configure.ac b/src/crypto/isa-l/isa-l_crypto/configure.ac
new file mode 100644
index 000000000..70f9cc88d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/configure.ac
@@ -0,0 +1,349 @@
+# -*- Autoconf -*-
+# Process this file with autoconf to produce a configure script.
+
+AC_PREREQ(2.69)
+AC_INIT([libisal_crypto],
+ [2.24.0],
+ [sg.support.isal@intel.com],
+ [isa-l_crypto],
+ [http://01.org/storage-acceleration-library])
+AC_CONFIG_SRCDIR([])
+AC_CONFIG_AUX_DIR([build-aux])
+AM_INIT_AUTOMAKE([
+ foreign
+ 1.11
+ -Wall
+ -Wno-portability
+ silent-rules
+ tar-pax
+ no-dist-gzip
+ dist-xz
+ subdir-objects
+])
+AM_PROG_AS
+
+AC_CANONICAL_HOST
+CPU=""
+AS_CASE([$host_cpu],
+ [x86_64], [CPU="x86_64"],
+ [amd64], [CPU="x86_64"],
+ [i?86], [CPU="x86_32"],
+ [aarch64], [CPU="aarch64"],
+ [arm64], [CPU="aarch64"],
+)
+AM_CONDITIONAL([CPU_X86_64], [test "$CPU" = "x86_64"])
+AM_CONDITIONAL([CPU_X86_32], [test "$CPU" = "x86_32"])
+AM_CONDITIONAL([CPU_AARCH64], [test "$CPU" = "aarch64"])
+AM_CONDITIONAL([CPU_UNDEFINED], [test "x$CPU" = "x"])
+
+if test "$CPU" = "x86_64"; then
+ is_x86=yes
+else
+ if test "$CPU" = "x86_32"; then
+ is_x86=yes
+ else
+ is_x86=no
+ fi
+fi
+
+# Check for programs
+AC_PROG_CC_STDC
+AC_USE_SYSTEM_EXTENSIONS
+AM_SILENT_RULES([yes])
+LT_INIT
+AC_PREFIX_DEFAULT([/usr])
+AC_PROG_SED
+AC_PROG_MKDIR_P
+
+# Options
+AC_ARG_ENABLE([debug],
+ AS_HELP_STRING([--enable-debug], [enable debug messages @<:@default=disabled@:>@]),
+ [], [enable_debug=no])
+AS_IF([test "x$enable_debug" = "xyes"], [
+ AC_DEFINE(ENABLE_DEBUG, [1], [Debug messages.])
+])
+# If this build is for x86, look for yasm and nasm
+if test x"$is_x86" = x"yes"; then
+AC_MSG_CHECKING([whether Intel CET is enabled])
+AC_TRY_COMPILE([],[
+#ifndef __CET__
+# error CET is not enabled
+#endif],
+ [AC_MSG_RESULT([yes])
+ intel_cet_enabled=yes],
+ [AC_MSG_RESULT([no])
+ intel_cet_enabled=no])
+
+
+ # Pick an assembler yasm or nasm
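+ # Feature levels used below: 1 = assembler found, 4 = modern assembler,
+ # 6 = AVX512 support, 10 = extended AVX512 (the level the .asm files test
+ # via AS_FEATURE_LEVEL >= 10 before enabling the VAES paths).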
+ if test x"$AS" = x""; then
+ # Check for yasm and yasm features
+ yasm_feature_level=0
+ AC_CHECK_PROG(HAVE_YASM, yasm, yes, no)
+ if test "$HAVE_YASM" = "yes"; then
+ yasm_feature_level=1
+ else
+ AC_MSG_RESULT([no yasm])
+ fi
+ if test x"$yasm_feature_level" = x"1"; then
+ AC_MSG_CHECKING([for modern yasm])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[vmovdqa %xmm0, %xmm1;]])])
+ if yasm -f elf64 -p gas conftest.c ; then
+ AC_MSG_RESULT([yes])
+ yasm_feature_level=4
+ else
+ AC_MSG_RESULT([no])
+ fi
+ fi
+ if test x"$yasm_feature_level" = x"4"; then
+ AC_MSG_CHECKING([for optional yasm AVX512 support])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[vpshufb %zmm0, %zmm1, %zmm2;]])])
+ if yasm -f elf64 -p gas conftest.c 2> /dev/null; then
+ AC_MSG_RESULT([yes])
+ yasm_feature_level=6
+ else
+ AC_MSG_RESULT([no])
+ fi
+ fi
+ if test x"$yasm_feature_level" = x"6"; then
+ AC_MSG_CHECKING([for additional yasm AVX512 support])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[vpcompressb zmm0, k1, zmm1;]])])
+ sed -i -e '/vpcompressb/!d' conftest.c
+ if yasm -f elf64 conftest.c 2> /dev/null; then
+ AC_MSG_RESULT([yes])
+ yasm_feature_level=10
+ else
+ AC_MSG_RESULT([no])
+ fi
+ fi
+
+ AC_MSG_CHECKING([for optional yasm SHA-NI support])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[sha256rnds2 %xmm0,%xmm1,%xmm2;]])])
+ if yasm -f elf64 -p gas conftest.c 2> /dev/null; then
+ yasm_knows_shani=yes
+ AC_MSG_RESULT([yes])
+ else
+ AC_MSG_RESULT([no])
+ fi
+
+ # Check for nasm and nasm features
+ nasm_feature_level=0
+ AC_CHECK_PROG(HAVE_NASM, nasm, yes, no)
+ if test "$HAVE_NASM" = "yes"; then
+ nasm_feature_level=1
+ else
+ AC_MSG_RESULT([no nasm])
+ fi
+
+ if test x"$nasm_feature_level" = x"1"; then
+ AC_MSG_CHECKING([for modern nasm])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[pblendvb xmm2, xmm1;]])])
+ sed -i -e '/pblendvb/!d' conftest.c
+ if nasm -f elf64 conftest.c 2> /dev/null; then
+ AC_MSG_RESULT([yes])
+ nasm_feature_level=4
+ else
+ AC_MSG_RESULT([no])
+ fi
+ fi
+ if test x"$nasm_feature_level" = x"4"; then
+ AC_MSG_CHECKING([for optional nasm AVX512 support])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[vinserti32x8 zmm0, ymm1, 1;]])])
+ sed -i -e '/vinsert/!d' conftest.c
+ if nasm -f elf64 conftest.c 2> /dev/null; then
+ AC_MSG_RESULT([yes])
+ nasm_feature_level=6
+ else
+ AC_MSG_RESULT([no])
+ fi
+ fi
+ if test x"$nasm_feature_level" = x"6"; then
+ AC_MSG_CHECKING([for additional nasm AVX512 support])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[vpcompressb zmm0 {k1}, zmm1;]])])
+ sed -i -e '/vpcompressb/!d' conftest.c
+ if nasm -f elf64 conftest.c 2> /dev/null; then
+ AC_MSG_RESULT([yes])
+ nasm_feature_level=10
+ else
+ AC_MSG_RESULT([no])
+ fi
+ fi
+
+ AC_MSG_CHECKING([for optional nasm SHA-NI support])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[sha256rnds2 xmm2,xmm1,xmm0;]])])
+ sed -i -e '/sha256rnds2/!d' conftest.c
+ if nasm -f elf64 conftest.c 2> /dev/null; then
+ nasm_knows_shani=yes
+ AC_MSG_RESULT([yes])
+ else
+ AC_MSG_RESULT([no])
+ fi
+
+ if test $nasm_feature_level -ge $yasm_feature_level ; then
+ AS=nasm
+ as_feature_level=$nasm_feature_level
+ as_knows_shani=$nasm_knows_shani
+ else
+ AS=yasm
+ as_feature_level=$yasm_feature_level
+ as_knows_shani=$yasm_knows_shani
+ fi
+
+ else
+ # Check for $AS supported features
+ as_feature_level=0
+ AC_CHECK_PROG(HAVE_AS, $AS, yes, no)
+ if test "$HAVE_AS" = "yes"; then
+ as_feature_level=1
+ else
+ AC_MSG_ERROR([no $AS])
+ fi
+
+ if test x"$as_feature_level" = x"1"; then
+ AC_MSG_CHECKING([for modern $AS])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[pblendvb xmm2, xmm1;]])])
+ sed -i -e '/pblendvb/!d' conftest.c
+ if $AS -f elf64 conftest.c 2> /dev/null; then
+ AC_MSG_RESULT([yes])
+ as_feature_level=4
+ else
+ AC_MSG_RESULT([no])
+ fi
+ fi
+ if test x"$as_feature_level" = x"4"; then
+ AC_MSG_CHECKING([for optional as AVX512 support])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[vinserti32x8 zmm0, ymm1, 1;]])])
+ sed -i -e '/vinsert/!d' conftest.c
+ if $AS -f elf64 conftest.c 2> /dev/null; then
+ AC_MSG_RESULT([yes])
+ as_feature_level=6
+ else
+ AC_MSG_RESULT([no])
+ fi
+ fi
+ if test x"$as_feature_level" = x"6"; then
+ AC_MSG_CHECKING([for additional as AVX512 support])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[vpcompressb zmm0, k1, zmm1;]])])
+ sed -i -e '/vpcompressb/!d' conftest.c
+ if $AS -f elf64 conftest.c 2> /dev/null; then
+ AC_MSG_RESULT([yes])
+ as_feature_level=10
+ else
+ AC_MSG_RESULT([no])
+ fi
+ fi
+
+ AC_MSG_CHECKING([for optional as SHA-NI support])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[sha256rnds2 xmm2,xmm1,xmm0;]])])
+ sed -i -e '/sha256rnds2/!d' conftest.c
+ if $AS -f elf64 conftest.c 2> /dev/null; then
+ AC_MSG_RESULT([yes])
+ as_knows_shani=yes
+ else
+ AC_MSG_RESULT([no])
+ fi
+
+ fi
+
+ if test $as_feature_level -lt 2 ; then
+ AC_MSG_ERROR([No modern nasm or yasm found as required. Nasm should be v2.11.01 or later (v2.13 for AVX512) and yasm should be 1.2.0 or later.])
+ fi
+
+ if test x"$as_knows_shani" = x"yes"; then
+ AC_DEFINE(HAVE_AS_KNOWS_SHANI, [1], [Assembler can do SHANI.])
+ have_as_knows_shani=yes
+ else
+ AC_MSG_RESULT([Assembler does not understand SHANI opcodes. Consider upgrading for best performance.])
+ fi
+
+ case $host_os in
+ *linux*) arch=linux yasm_args="-f elf64";;
+ *darwin*) arch=darwin yasm_args="-f macho64 --prefix=_ ";;
+ *netbsd*) arch=netbsd yasm_args="-f elf64";;
+ *mingw*) arch=mingw yasm_args="-f win64";;
+ *) arch=unknown yasm_args="-f elf64";;
+ esac
+
+ # Fix for nasm missing windows features
+ if test x"$arch" = x"mingw"; then
+ AS=yasm
+ as_feature_level=$yasm_feature_level
+ if test $as_feature_level -lt 2 ; then
+ AC_MSG_ERROR([Mingw build requires Yasm 1.2.0 or later.])
+ fi
+ fi
+
+ AC_DEFINE_UNQUOTED(AS_FEATURE_LEVEL, [$as_feature_level], [Assembler feature level.])
+ if test $as_feature_level -ge 6 ; then
+ AC_DEFINE(HAVE_AS_KNOWS_AVX512, [1], [Assembler can do AVX512.])
+ have_as_knows_avx512=yes
+ else
+ AC_MSG_RESULT([Assembler does not understand AVX512 opcodes. Consider upgrading for best performance.])
+ fi
+
+ AM_CONDITIONAL(USE_YASM, test x"$AS" = x"yasm")
+ AM_CONDITIONAL(USE_NASM, test x"$AS" = x"nasm")
+ AM_CONDITIONAL(WITH_AVX512, test x"$have_as_knows_avx512" = x"yes")
+ AM_CONDITIONAL(WITH_SHANI, test x"$have_as_knows_shani" = x"yes")
+ AC_SUBST([yasm_args])
+ AM_CONDITIONAL(DARWIN, test x"$arch" = x"darwin")
+ AC_MSG_RESULT([Using $AS args target "$arch" "$yasm_args"])
+else
+ # Disable below conditionals if not x86
+ AM_CONDITIONAL(USE_YASM, test "x" = "y")
+ AM_CONDITIONAL(USE_NASM, test "x" = "y")
+ AM_CONDITIONAL(WITH_AVX512, test "x" = "y")
+ AM_CONDITIONAL(WITH_SHANI, test "x" = "y")
+ AM_CONDITIONAL(DARWIN, test "x" = "y")
+fi
+
+AM_CONDITIONAL(INTEL_CET_ENABLED, [test x"$intel_cet_enabled" = x"yes"])
+
+# Check for header files
+AC_CHECK_HEADERS([limits.h stdint.h stdlib.h string.h])
+
+# Checks for typedefs, structures, and compiler characteristics.
+AC_C_INLINE
+AC_TYPE_SIZE_T
+AC_TYPE_UINT16_T
+AC_TYPE_UINT32_T
+AC_TYPE_UINT64_T
+AC_TYPE_UINT8_T
+
+# Checks for library functions.
+AC_FUNC_MALLOC # Used only in tests
+AC_CHECK_FUNCS([memmove memset])
+
+my_CFLAGS="\
+-Wall \
+-Wchar-subscripts \
+-Wformat-security \
+-Wnested-externs \
+-Wpointer-arith \
+-Wshadow \
+-Wstrict-prototypes \
+-Wtype-limits \
+"
+AC_SUBST([my_CFLAGS])
+
+AC_CONFIG_FILES([\
+ Makefile\
+ libisal_crypto.pc
+])
+
+AC_OUTPUT
+AC_MSG_RESULT([
+ $PACKAGE $VERSION
+ =====
+
+ prefix: ${prefix}
+ sysconfdir: ${sysconfdir}
+ libdir: ${libdir}
+ includedir: ${includedir}
+
+ compiler: ${CC}
+ cflags: ${CFLAGS}
+ ldflags: ${LDFLAGS}
+
+ debug: ${enable_debug}
+])
diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/Makefile b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/Makefile
new file mode 100644
index 000000000..41e9e29f2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/Makefile
@@ -0,0 +1,27 @@
+
+INCLUDE = /usr/include
+CFLAGS = -O2 -I$(INCLUDE)
+LDLIBS = -lisal_crypto -lcrypto -lpthread
+test = isal_multithread_perf
+
+source += isal_multithread_perf.c
+source += md5_thread.c \
+ sha1_thread.c \
+ sha256_thread.c \
+ sha512_thread.c \
+ aes_thread.c
+
+ODIR = bin
+objects = $(addprefix $(ODIR)/, $(patsubst %.c, %.o, $(source)))
+
+$(test): $(objects)
+ gcc $? $(LDLIBS) -o $@
+
+$(ODIR): ; mkdir -p $(ODIR)
+$(objects): | $(ODIR)
+$(ODIR)/%.o: %.c
+ gcc -c $(CFLAGS) $< -o $@
+
+clean:
+ @echo Cleaning up
+ @rm -fr $(ODIR) $(test)
diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/README.txt b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/README.txt
new file mode 100644
index 000000000..60335f76c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/README.txt
@@ -0,0 +1,25 @@
+/*
+ * Saturation Test
+ * Written by Xiaodong Liu <xiaodong.liu@intel.com>
+ */
+
+This tool measures the saturation performance of ISA-L's multi-buffer hash and other algorithms.
+It can be used to compare multi-buffer hash against OpenSSL's single-buffer hash.
+
+Compilation:
+(Make sure the isa-l_crypto library is already installed. The other required libs are openssl and pthread.)
+make
+
+Usage: ./isal_multithread_perf -n num_threads
+ -v verbose output
+ -t time to run(secs)
+ -n number of algorithm threads
+ -l len of each buffer(KB)
+ -a memory copy before algorithm -- 1 do(default); 0 not do
+ -b memory copy after algorithm -- 1 do(default); 0 not do
+ -m method of algorithm: md5 md5_mb sha1 sha1_mb sha256 sha256_mb
+ sha512 sha512_mb cbc_128_dec cbc_192_dec cbc_256_dec xts_128_enc
+ xts_256_enc gcm_128_enc gcm_256_enc
+
+Example:
+./isal_multithread_perf -m md5 -n 10
diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/aes_thread.c b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/aes_thread.c
new file mode 100644
index 000000000..366fc9bcf
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/aes_thread.c
@@ -0,0 +1,380 @@
+
+#include <pthread.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "isal_multithread_perf.h"
+
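+/*
+ * Per-algorithm context: key size in bits plus callbacks for one-time setup,
+ * the per-round cipher operation, and teardown.
+ */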
+struct aes_context {
+ int const bits;
+ int (*const preproc)(struct aes_context * pCtx);
+ void (*const processor)(struct aes_context * pCtx, char *plaintext,
+ char *ciphertext, uint64_t len);
+ void (*const postproc)(struct aes_context * pCtx);
+};
+
+#define rounds_buf 2 /* first one is plain text, second is cipher text */
+
+static uint64_t aes_thread_func(int32_t id, struct aes_context *pCtx)
+{
+ uint32_t i = 0, j = 0;
+ char *aes_buf[rounds_buf] = { NULL }; /* aes_buf holds the plaintext/ciphertext for the cipher operation */
+ char *carry_buf[rounds_buf] = { NULL }; /* carry_buf is used for the optional memory-copy steps */
+ uint64_t round = -1;
+ struct timeval start_tv, stop_tv;
+ long long secs = run_secs;
+
+ printfv("Thread %i is started\n", id);
+ /* memory allocate */
+ for (j = 0; j < rounds_buf; j++) {
+ carry_buf[j] = (char *)calloc((size_t)buflen, 1);
+ if (carry_buf[j] == NULL) {
+ printf("calloc failed test aborted\n");
+ goto out;
+ }
+
+ aes_buf[j] = (char *)calloc((size_t)buflen, 1);
+ if (aes_buf[j] == NULL) {
+ printf("calloc failed test aborted\n");
+ goto out;
+ }
+
+ /* Create the random data */
+ for (i = 0; i < buflen; i += 1024) {
+ carry_buf[j][i] = i % 256;
+ aes_buf[j][i] = i % 256;
+ }
+ }
+
+ if (pCtx->preproc(pCtx)) {
+ printf("preproc failed test aborted\n");
+ goto out;
+ }
+
+ /* Thread sync */
+ pthread_mutex_lock(&count_lock);
+ count++;
+ if (count == num_threads) {
+ pthread_cond_broadcast(&count_cond);
+ } else {
+ pthread_cond_wait(&count_cond, &count_lock);
+ }
+ pthread_mutex_unlock(&count_lock);
+
+ printfv("Thread %i is ready\n", id);
+ /* hash func starts to run */
+ round = 0;
+ gettimeofday(&start_tv, 0);
+ gettimeofday(&stop_tv, 0);
+ while (secs > (stop_tv.tv_sec - start_tv.tv_sec)) {
+ /* Pre mem-operation */
+ if (prememcpy)
+ memcpy(aes_buf[0], carry_buf[0], buflen);
+
+ /* Run the cipher operation (encrypt or decrypt, per the context) */
+ pCtx->processor(pCtx, aes_buf[0], aes_buf[1], buflen);
+
+ /* Post mem-operation */
+ if (postmemcpy)
+ memcpy(carry_buf[1], aes_buf[1], buflen);
+
+ round++;
+
+ gettimeofday(&stop_tv, 0);
+ }
+ printfv("thread %2i, aes_func rounds %ld\n", id, round);
+
+ out:
+ pCtx->postproc(pCtx);
+
+ for (j = 0; j < rounds_buf; j++) {
+ free(carry_buf[j]);
+ free(aes_buf[j]);
+ }
+
+ return round;
+}
+
+/*
+ * facilities for AES-CBC
+ */
+static unsigned char const ic[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
+ 0x0e, 0x0f
+};
+
+void mk_rand_data(uint8_t * data, uint32_t size)
+{
+ unsigned int i;
+ for (i = 0; i < size; i++) {
+ *data++ = rand();
+ }
+}
+
+/* thread functions for cbc dec */
+struct cbc_context {
+ struct aes_context base;
+ uint8_t *iv;
+ uint8_t key[CBC_256_BITS];
+ struct cbc_key_data *key_data;
+};
+
+static int cbc_dec_pre(struct aes_context *p)
+{
+ struct cbc_context *pCtx = (struct cbc_context *)p;
+ int ret;
+
+ ret = posix_memalign((void **)&pCtx->iv, 16, (CBC_IV_DATA_LEN));
+ ret |= posix_memalign((void **)&pCtx->key_data, 16, (sizeof(*pCtx->key_data)));
+
+ if ((0 != ret) || (NULL == pCtx->iv) || (NULL == pCtx->key_data))
+ return 1;
+
+ mk_rand_data(pCtx->key, sizeof(pCtx->key));
+ memcpy(pCtx->iv, ic, CBC_IV_DATA_LEN);
+ aes_cbc_precomp(pCtx->key, pCtx->base.bits, pCtx->key_data);
+
+ return 0;
+}
+
+static void cbc_dec_post(struct aes_context *p)
+{
+ struct cbc_context *pCtx = (struct cbc_context *)p;
+
+ free(pCtx->iv);
+ free(pCtx->key_data);
+
+ return;
+}
+
+static void cbc_dec_proc(struct aes_context *p, char *plaintext, char *ciphertext,
+ uint64_t len)
+{
+ struct cbc_context *pCtx = (struct cbc_context *)p;
+
+ if (pCtx->base.bits == 128)
+ aes_cbc_dec_128(ciphertext, pCtx->iv, pCtx->key_data->dec_keys, plaintext,
+ len);
+ else if (pCtx->base.bits == 192)
+ aes_cbc_dec_192(ciphertext, pCtx->iv, pCtx->key_data->dec_keys, plaintext,
+ len);
+ else if (pCtx->base.bits == 256)
+ aes_cbc_dec_256(ciphertext, pCtx->iv, pCtx->key_data->dec_keys, plaintext,
+ len);
+ else {
+ printf("unsupported cbc encryption bits %d\n", pCtx->base.bits);
+ exit(1);
+ }
+
+ return;
+}
+
+void *cbc_128_dec_func(void *arg)
+{
+ int32_t id = *((int *)arg);
+ uint64_t round = -1;
+
+ struct cbc_context ctx =
+ { {128, cbc_dec_pre, cbc_dec_proc, cbc_dec_post}, NULL, {0}, NULL };
+
+ round = aes_thread_func(id, &ctx.base);
+
+ pthread_exit((void *)round);
+}
+
+void *cbc_192_dec_func(void *arg)
+{
+ int32_t id = *((int *)arg);
+ uint64_t round = -1;
+
+ struct cbc_context ctx =
+ { {192, cbc_dec_pre, cbc_dec_proc, cbc_dec_post}, NULL, {0}, NULL };
+
+ round = aes_thread_func(id, &ctx.base);
+
+ pthread_exit((void *)round);
+}
+
+void *cbc_256_dec_func(void *arg)
+{
+ int32_t id = *((int *)arg);
+ uint64_t round = -1;
+
+ struct cbc_context ctx =
+ { {256, cbc_dec_pre, cbc_dec_proc, cbc_dec_post}, NULL, {0}, NULL };
+
+ round = aes_thread_func(id, &ctx.base);
+
+ pthread_exit((void *)round);
+}
+
+/*
+ * thread functions for xts enc
+ */
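+/* key1/key2 are sized for the largest (256-bit) key, so one context serves both key widths */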
+struct xts_content {
+ struct aes_context base;
+ unsigned char key1[16 * 2];
+ unsigned char key2[16 * 2];
+ unsigned char tinit[16];
+};
+
+static int xts_enc_pre(struct aes_context *p)
+{
+ struct xts_content *pCtx = (struct xts_content *)p;
+
+ mk_rand_data(pCtx->key1, pCtx->base.bits / 8);
+ mk_rand_data(pCtx->key2, pCtx->base.bits / 8);
+ mk_rand_data(pCtx->tinit, sizeof(pCtx->tinit));
+
+ return 0;
+}
+
+static void xts_enc_post(struct aes_context *p)
+{
+ return;
+}
+
+static void xts_enc_proc(struct aes_context *p, char *plaintext, char *ciphertext,
+ uint64_t len)
+{
+ struct xts_content *pCtx = (struct xts_content *)p;
+
+ if (pCtx->base.bits == 128)
+ XTS_AES_128_enc(pCtx->key2, pCtx->key1, pCtx->tinit, len, plaintext,
+ ciphertext);
+ else if (pCtx->base.bits == 256)
+ XTS_AES_256_enc(pCtx->key2, pCtx->key1, pCtx->tinit, len, plaintext,
+ ciphertext);
+ else {
+ printf("unsupported xts encryption bits %d\n", pCtx->base.bits);
+ exit(1);
+ }
+
+ return;
+}
+
+void *xts_128_enc_func(void *arg)
+{
+ int32_t id = *((int *)arg);
+ uint64_t round = -1;
+
+ struct xts_content ctx =
+ { {128, xts_enc_pre, xts_enc_proc, xts_enc_post}, {0}, {0}, {0} };
+
+ round = aes_thread_func(id, &ctx.base);
+
+ pthread_exit((void *)round);
+}
+
+void *xts_256_enc_func(void *arg)
+{
+ int32_t id = *((int *)arg);
+ uint64_t round = -1;
+
+ struct xts_content ctx =
+ { {256, xts_enc_pre, xts_enc_proc, xts_enc_post}, {0}, {0}, {0} };
+
+ round = aes_thread_func(id, &ctx.base);
+
+ pthread_exit((void *)round);
+}
+
+/*
+ * thread functions for gcm enc
+ */
+struct gcm_context {
+ struct aes_context base;
+ uint8_t *key;
+ unsigned char *iv;
+ unsigned char *aad;
+ unsigned char *gcm_tag;
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+};
+
+static int gcm_enc_pre(struct aes_context *p)
+{
+ uint8_t const IVend[] = GCM_IV_END_MARK;
+
+ struct gcm_context *pCtx = (struct gcm_context *)p;
+
+ pCtx->key = malloc(pCtx->base.bits / 8);
+ pCtx->iv = malloc(GCM_IV_LEN);
+ pCtx->gcm_tag = malloc(MAX_TAG_LEN);
+ pCtx->aad = malloc(AAD_LENGTH);
+
+ mk_rand_data(pCtx->aad, AAD_LENGTH);
+
+ mk_rand_data(pCtx->iv, GCM_IV_LEN);
+ memcpy(&pCtx->iv[GCM_IV_END_START], IVend, sizeof(IVend));
+
+ mk_rand_data(pCtx->key, pCtx->base.bits / 8);
+ if (pCtx->base.bits == 128)
+ aes_gcm_pre_128(pCtx->key, &pCtx->gkey);
+ else
+ aes_gcm_pre_256(pCtx->key, &pCtx->gkey);
+
+ return 0;
+}
+
+static void gcm_enc_post(struct aes_context *p)
+{
+ struct gcm_context *pCtx = (struct gcm_context *)p;
+
+ free(pCtx->key);
+ free(pCtx->iv);
+ free(pCtx->gcm_tag);
+ free(pCtx->aad);
+
+ return;
+}
+
+static void gcm_enc_proc(struct aes_context *p, char *plaintext, char *ciphertext,
+ uint64_t len)
+{
+ struct gcm_context *pCtx = (struct gcm_context *)p;
+
+ if (pCtx->base.bits == 128)
+ aes_gcm_enc_128(&pCtx->gkey, &pCtx->gctx, ciphertext, plaintext, len, pCtx->iv,
+ pCtx->aad, AAD_LENGTH, pCtx->gcm_tag, MAX_TAG_LEN);
+ else if (pCtx->base.bits == 256)
+ aes_gcm_enc_256(&pCtx->gkey, &pCtx->gctx, ciphertext, plaintext, len, pCtx->iv,
+ pCtx->aad, AAD_LENGTH, pCtx->gcm_tag, MAX_TAG_LEN);
+ else {
+ printf("unsupported gcm encryption bits %d\n", pCtx->base.bits);
+ exit(1);
+ }
+
+ return;
+}
+
+void *gcm_128_enc_func(void *arg)
+{
+ int32_t id = *((int *)arg);
+ uint64_t round = -1;
+
+ struct gcm_context ctx =
+ { {128, gcm_enc_pre, gcm_enc_proc, gcm_enc_post}, NULL, NULL, NULL, NULL, {0} };
+
+ round = aes_thread_func(id, &ctx.base);
+
+ pthread_exit((void *)round);
+}
+
+void *gcm_256_enc_func(void *arg)
+{
+ int32_t id = *((int *)arg);
+ uint64_t round = -1;
+
+ struct gcm_context ctx =
+ { {256, gcm_enc_pre, gcm_enc_proc, gcm_enc_post}, NULL, NULL, NULL, NULL, {0} };
+
+ round = aes_thread_func(id, &ctx.base);
+
+ pthread_exit((void *)round);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.c b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.c
new file mode 100644
index 000000000..1263fea29
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.c
@@ -0,0 +1,206 @@
+/**
+ * @file isal_multithread_perf.c
+ * @brief Verify saturation behaviour of high-speed algorithms when run across many threads
+ * @details
+ * usage: taskset -c <cpu_index1,cpu_index2,...> isal_multithread_perf -m <algorithm name> -n <thread num>
+ * eg: taskset -c 0-9,20-29 ./isal_multithread_perf -m md5_mb -n 10
+ */
+
+#include <pthread.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdarg.h>
+
+#include "isal_multithread_perf.h"
+
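+/* Table of selectable methods: name, thread entry point, and buffers processed per round */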
+alg_method algs[] = {
+ {"md5", md5_ossl_func, MD5_MAX_LANES}
+ ,
+ {"md5_mb", md5_mb_func, MD5_MAX_LANES}
+ ,
+ {"sha1", sha1_ossl_func, SHA1_MAX_LANES}
+ ,
+ {"sha1_mb", sha1_mb_func, SHA1_MAX_LANES}
+ ,
+ {"sha256", sha256_ossl_func, SHA256_MAX_LANES}
+ ,
+ {"sha256_mb", sha256_mb_func, SHA256_MAX_LANES}
+ ,
+ {"sha512", sha512_ossl_func, SHA512_MAX_LANES}
+ ,
+ {"sha512_mb", sha512_mb_func, SHA512_MAX_LANES}
+ ,
+ {"cbc_128_dec", cbc_128_dec_func, 1}
+ ,
+ {"cbc_192_dec", cbc_192_dec_func, 1}
+ ,
+ {"cbc_256_dec", cbc_256_dec_func, 1}
+ ,
+ {"xts_128_enc", xts_128_enc_func, 1}
+ ,
+ {"xts_256_enc", xts_256_enc_func, 1}
+ ,
+ {"gcm_128_enc", gcm_128_enc_func, 1}
+ ,
+ {"gcm_256_enc", gcm_256_enc_func, 1}
+ ,
+
+ {NULL, NULL}
+};
+
+/* Global parameters*/
+long long run_secs = 10;
+uint32_t num_threads = 2;
+uint32_t buflen = 32 * 1024;
+uint32_t prememcpy = 0;
+uint32_t postmemcpy = 0;
+char *method = "md5_mb";
+
+/* Global thread sync */
+pthread_mutex_t count_lock = PTHREAD_MUTEX_INITIALIZER;
+pthread_cond_t count_cond = PTHREAD_COND_INITIALIZER;
+volatile uint32_t count = 0;
+
+int verbose = 0;
+
+void usage(char *appname)
+{
+ int i = 0;
+ printf("Usage: %s -n num_threads\n", appname);
+ printf("\t-v verbose output\n"
+ "\t-t time to run(secs)\n"
+ "\t-n number of algorithm threads\n"
+ "\t-l len of each buffer(KB)\n"
+ "\t-a memory copy before algorithm -- 1 do(default); 0 not do\n"
+ "\t-b memory copy after algorithm -- 1 do(default); 0 not do\n"
+ "\t-m method of algorithm:");
+ for (i = 0; algs[i].name != NULL; i++)
+ printf(" %s", algs[i].name);
+ printf("\n");
+
+}
+
+void notice(char *appname, alg_method * alg_choose_p)
+{
+ int i = 0;
+ printf("%s starts to run\n", appname);
+ printf("\tverbose output is %d\n"
+ "\truntime is %lld(secs)\n"
+ "\tnumber of algorithm threads is %d\n"
+ "\tlen of each buffer(KB) is %d\n"
+ "\tmemory copy before algorithm is %d\n"
+ "\tmemory copy after algorithm is %d\n"
+ "\tmethod of algorithm is %s\n", verbose, run_secs, num_threads, buflen / 1024,
+ prememcpy, postmemcpy, alg_choose_p->name);
+}
+
+int main(int argc, char **argv)
+{
+ int i = 0;
+ int opt;
+ char *optstring = "t:n:m:l:a:b:v";
+ int32_t *id = NULL, ret = 0;
+ alg_method alg_choose;
+ pthread_t *clients = NULL;
+ uint64_t count = 0, sum = 0;
+ uint32_t rounds_buf;
+
+ while ((opt = getopt(argc, argv, optstring)) != -1) {
+ switch (opt) {
+ case 't':
+ run_secs = atol(optarg);
+ if (run_secs <= 0) {
+ usage(argv[0]);
+ exit(-1);
+ }
+ break;
+ case 'n':
+ num_threads = atoi(optarg);
+ if (num_threads <= 0) {
+ usage(argv[0]);
+ exit(-1);
+ }
+ break;
+ case 'm':
+ method = optarg;
+ break;
+ case 'l':
+ buflen = atoi(optarg) * 1024;
+ if (buflen <= 0) {
+ usage(argv[0]);
+ exit(-1);
+ }
+ break;
+ case 'a':
+ prememcpy = atoi(optarg);
+ if (prememcpy != 0 && prememcpy != 1) {
+ usage(argv[0]);
+ exit(-1);
+ }
+ break;
+ case 'b':
+ postmemcpy = atoi(optarg);
+ if (postmemcpy != 0 && postmemcpy != 1) {
+ usage(argv[0]);
+ exit(-1);
+ }
+ break;
+ case 'v':
+ verbose = 1;
+ break;
+ default:
+ usage(argv[0]);
+ exit(0);
+ }
+ }
+
+ /* Check method str and set algorithm_func */
+ for (i = 0; algs[i].name != NULL; i++) {
+ if (!strcmp(method, algs[i].name)) {
+ alg_choose = algs[i];
+ break;
+ }
+ }
+ if (algs[i].name == NULL) {
+ usage(argv[0]);
+ exit(-1);
+ }
+
+ notice(argv[0], &alg_choose);
+ rounds_buf = alg_choose.rounds_nbuf;
+
+ clients = (pthread_t *) calloc(num_threads + 1, sizeof(pthread_t));
+ id = (int32_t *) calloc(num_threads + 1, sizeof(int32_t));
+
+ printf("Start %i threads, use %s function\n", num_threads, alg_choose.name);
+
+ for (i = 0; i < num_threads; i++) {
+ id[i] = i;
+
+ ret =
+ pthread_create(&clients[i], NULL, alg_choose.thread_func, (void *)&id[i]);
+
+ if (ret != 0) {
+ printf("Failed to create thread %i: %s", i, strerror(ret));
+ exit(-1);
+ }
+ printfv("Thread %i is created\n", i);
+ }
+
+ for (i = 0; i < num_threads; i++) {
+ pthread_join(clients[i], (void *)&count);
+ sum += count;
+ }
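+ /* One round processes rounds_buf buffers of buflen bytes, so loop_unit
+  * converts (rounds / run_secs) into MB/s. */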
+ double loop_unit = ((double)buflen) * rounds_buf / run_secs / 1024 / 1024;
+ printf("Sum of rounds is %ld\n"
+ "Average throughput(MB/s) is %.2f\n"
+ "Total throughput(MB/s) is %.2f\n",
+ sum, (double)sum / i * loop_unit, (double)sum * loop_unit);
+
+ exit(0);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.h b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.h
new file mode 100644
index 000000000..4f38705dd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.h
@@ -0,0 +1,52 @@
+
+#ifndef ISAL_MULTITHREAD_PERF_H_
+#define ISAL_MULTITHREAD_PERF_H_
+
+#include "isa-l_crypto.h"
+
+/* multibuffer hash */
+void *md5_ossl_func(void *arg);
+void *md5_mb_func(void *arg);
+void *sha1_ossl_func(void *arg);
+void *sha1_mb_func(void *arg);
+void *sha256_ossl_func(void *arg);
+void *sha256_mb_func(void *arg);
+void *sha512_ossl_func(void *arg);
+void *sha512_mb_func(void *arg);
+
+/* aes */
+void *cbc_128_dec_func(void *arg);
+void *cbc_192_dec_func(void *arg);
+void *cbc_256_dec_func(void *arg);
+void *xts_128_enc_func(void *arg);
+void *xts_256_enc_func(void *arg);
+#define AAD_LENGTH 16
+void *gcm_128_enc_func(void *arg);
+void *gcm_256_enc_func(void *arg);
+
+
+typedef struct {
+ char *name;
+ void *(*thread_func) (void *arg);
+ uint32_t rounds_nbuf; /* bufs number of one processing round */
+} alg_method;
+
+
+/* Global parameters*/
+extern long long run_secs;
+extern uint32_t num_threads;
+extern uint32_t buflen;
+extern uint32_t prememcpy;
+extern uint32_t postmemcpy;
+
+extern pthread_mutex_t count_lock;
+extern pthread_cond_t count_cond;
+extern volatile uint32_t count;
+
+extern int verbose;
+#define printfv(format, args...) { \
+ if (verbose) \
+ printf (format, ##args); \
+}
+
+#endif /* ISAL_MULTITHREAD_PERF_H_ */
diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/md5_thread.c b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/md5_thread.c
new file mode 100644
index 000000000..f63b3785b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/md5_thread.c
@@ -0,0 +1,213 @@
+
+#include <pthread.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <openssl/md5.h>
+#include <openssl/sha.h>
+
+#include "isal_multithread_perf.h"
+
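+/*
+ * Defaults to MD5. sha1/sha256/sha512_thread.c define HASH_THREAD and their own
+ * macro set, then include this file again to instantiate the same thread functions.
+ */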
+#ifndef HASH_THREAD
+/* MD5 related params and structures*/
+#define DIGEST_NWORDS MD5_DIGEST_NWORDS
+#define MB_BUFS MD5_MAX_LANES
+#define HASH_CTX_MGR MD5_HASH_CTX_MGR
+#define HASH_CTX MD5_HASH_CTX
+
+#define OSSL_THREAD_FUNC md5_ossl_func
+#define OSSL_HASH_FUNC MD5
+#define MB_THREAD_FUNC md5_mb_func
+#define CTX_MGR_INIT md5_ctx_mgr_init
+#define CTX_MGR_SUBMIT md5_ctx_mgr_submit
+#define CTX_MGR_FLUSH md5_ctx_mgr_flush
+
+#define rounds_buf MD5_MAX_LANES
+
+#endif // HASH_THREAD
+
+typedef uint32_t hash_digests[DIGEST_NWORDS];
+
+void *OSSL_THREAD_FUNC(void *arg)
+{
+ int32_t id = *((int *)arg);
+ uint32_t i = 0, j = 0;
+ char *hash_buf[rounds_buf] = { NULL }; /* hash buf is used to do hash compute */
+ char *carry_buf[rounds_buf] = { NULL }; /* carry buf is used to do memory movement */
+ hash_digests digest;
+ uint64_t round = -1;
+ struct timeval start_tv, stop_tv;
+ long long secs = run_secs;
+
+ printfv("Thread %i is started\n", id);
+ /* memory allocate */
+ for (j = 0; j < rounds_buf; j++) {
+ carry_buf[j] = (char *)calloc((size_t)buflen, 1);
+ if (carry_buf[j] == NULL) {
+ printf("calloc failed test aborted\n");
+ goto out;
+ }
+
+ hash_buf[j] = (char *)calloc((size_t)buflen, 1);
+ if (hash_buf[j] == NULL) {
+ printf("calloc failed test aborted\n");
+ goto out;
+ }
+
+ /* Create the random data */
+ for (i = 0; i < buflen; i += 1024) {
+ carry_buf[j][i] = i % 256;
+ hash_buf[j][i] = i % 256;
+ }
+ }
+
+ /* Thread sync */
+ pthread_mutex_lock(&count_lock);
+ count++;
+ if (count == num_threads) {
+ pthread_cond_broadcast(&count_cond);
+ } else {
+ pthread_cond_wait(&count_cond, &count_lock);
+ }
+ pthread_mutex_unlock(&count_lock);
+
+ printfv("Thread %i is ready\n", id);
+ /* hash func starts to run */
+ round = 0;
+ gettimeofday(&start_tv, 0);
+ gettimeofday(&stop_tv, 0);
+ while (secs > (stop_tv.tv_sec - start_tv.tv_sec)) {
+ for (j = 0; j < rounds_buf; j++) {
+ /* Pre mem-operation */
+ if (prememcpy)
+ memcpy(hash_buf[j], carry_buf[j], buflen);
+
+ /* Calculate hash digest */
+ OSSL_HASH_FUNC((char *)hash_buf[j], buflen, (unsigned char *)&digest);
+
+ /* Post mem-operation */
+ if (postmemcpy)
+ memcpy(carry_buf[j], hash_buf[j], buflen);
+ }
+ round++;
+
+ gettimeofday(&stop_tv, 0);
+ }
+ printfv("thread %2i, openssl_func rounds %ld\n", id, round);
+
+ out:
+ for (j = 0; j < rounds_buf; j++) {
+ free(carry_buf[j]);
+ free(hash_buf[j]);
+ }
+
+ pthread_exit((void *)round);
+}
+
+void *MB_THREAD_FUNC(void *arg)
+{
+ int32_t id = *((int *)arg);
+ uint32_t i = 0, j = 0;
+ char *hash_buf[rounds_buf] = { NULL }; /* hash buf is used to do hash compute */
+ char *carry_buf[rounds_buf] = { NULL }; /* carry buf is used to do memory movement */
+ hash_digests *digests[rounds_buf];
+ uint64_t round = -1;
+ struct timeval start_tv, stop_tv;
+ long long secs = run_secs;
+ int ret;
+
+ HASH_CTX_MGR *mgr = NULL;
+ HASH_CTX *ctxpool = NULL, *ctx = NULL;
+
+ printfv("Thread %i is started\n", id);
+ /* Memory allocate */
+ for (j = 0; j < rounds_buf; j++) {
+ carry_buf[j] = (char *)calloc((size_t)buflen, 1);
+ if (carry_buf[j] == NULL) {
+ printf("calloc failed test aborted\n");
+ goto out;
+ }
+
+ hash_buf[j] = (char *)calloc((size_t)buflen, 1);
+ if (hash_buf[j] == NULL) {
+ printf("calloc failed test aborted\n");
+ goto out;
+ }
+
+ digests[j] = (hash_digests *) calloc(sizeof(hash_digests), 1);
+
+ /* Create the random data */
+ for (i = 0; i < buflen; i += 1024) {
+ carry_buf[j][i] = i % 256;
+ hash_buf[j][i] = i % 256;
+ }
+ }
+
+ ctxpool = (HASH_CTX *) calloc(rounds_buf, sizeof(HASH_CTX));
+ for (i = 0; i < rounds_buf; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+ ret = posix_memalign((void *)&mgr, 16, sizeof(HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ goto out;
+ }
+ CTX_MGR_INIT(mgr);
+
+ printfv("Thread %i gets to wait\n", id);
+ /* Thread sync */
+ pthread_mutex_lock(&count_lock);
+ count++;
+ if (count == num_threads) {
+ pthread_cond_broadcast(&count_cond);
+ } else {
+ pthread_cond_wait(&count_cond, &count_lock);
+ }
+ pthread_mutex_unlock(&count_lock);
+
+ printfv("Thread %i is ready\n", id);
+ /* hash func starts to run */
+ round = 0;
+ gettimeofday(&start_tv, 0);
+ gettimeofday(&stop_tv, 0);
+ while (secs > (stop_tv.tv_sec - start_tv.tv_sec)) {
+ for (j = 0; j < rounds_buf; j += MB_BUFS) {
+ for (i = 0; i < MB_BUFS; i++) {
+ /* Pre mem-operation */
+ if (prememcpy)
+ memcpy(hash_buf[j + i], carry_buf[j + i], buflen);
+
+ CTX_MGR_SUBMIT(mgr, &ctxpool[j + i], hash_buf[j + i], buflen,
+ HASH_ENTIRE);
+ }
+
+ /* Calculate hash digest */
+ while (CTX_MGR_FLUSH(mgr)) ;
+ for (i = 0; i < MB_BUFS; i++) {
+ /* Post mem-operation */
+ if (postmemcpy)
+ memcpy(carry_buf[j + i], hash_buf[j + i], buflen);
+ }
+ }
+ round++;
+
+ gettimeofday(&stop_tv, 0);
+ }
+ printfv("thread %2i, multibuffer_func rounds %ld\n", id, round);
+
+ out:
+ free(ctxpool);
+ free(mgr);
+ for (j = 0; j < rounds_buf; j++) {
+ free(carry_buf[j]);
+ free(digests[j]);
+ free(hash_buf[j]);
+ }
+
+ pthread_exit((void *)round);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha1_thread.c b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha1_thread.c
new file mode 100644
index 000000000..5ec7eb04a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha1_thread.c
@@ -0,0 +1,20 @@
+
+#define HASH_THREAD
+/* sha1 related params and structures*/
+#define DIGEST_NWORDS SHA1_DIGEST_NWORDS
+#define MB_BUFS SHA1_MAX_LANES
+#define HASH_CTX_MGR SHA1_HASH_CTX_MGR
+#define HASH_CTX SHA1_HASH_CTX
+
+#define OSSL_THREAD_FUNC sha1_ossl_func
+#define OSSL_HASH_FUNC SHA1
+#define MB_THREAD_FUNC sha1_mb_func
+#define CTX_MGR_INIT sha1_ctx_mgr_init
+#define CTX_MGR_SUBMIT sha1_ctx_mgr_submit
+#define CTX_MGR_FLUSH sha1_ctx_mgr_flush
+
+#define rounds_buf SHA1_MAX_LANES
+
+#include "md5_thread.c"
+
+#undef HASH_THREAD
diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha256_thread.c b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha256_thread.c
new file mode 100644
index 000000000..c155c19d4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha256_thread.c
@@ -0,0 +1,20 @@
+
+#define HASH_THREAD
+/* sha256 related params and structures*/
+#define DIGEST_NWORDS SHA256_DIGEST_NWORDS
+#define MB_BUFS SHA256_MAX_LANES
+#define HASH_CTX_MGR SHA256_HASH_CTX_MGR
+#define HASH_CTX SHA256_HASH_CTX
+
+#define OSSL_THREAD_FUNC sha256_ossl_func
+#define OSSL_HASH_FUNC SHA256
+#define MB_THREAD_FUNC sha256_mb_func
+#define CTX_MGR_INIT sha256_ctx_mgr_init
+#define CTX_MGR_SUBMIT sha256_ctx_mgr_submit
+#define CTX_MGR_FLUSH sha256_ctx_mgr_flush
+
+#define rounds_buf SHA256_MAX_LANES
+
+#include "md5_thread.c"
+
+#undef HASH_THREAD
diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha512_thread.c b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha512_thread.c
new file mode 100644
index 000000000..5861835a8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha512_thread.c
@@ -0,0 +1,20 @@
+
+#define HASH_THREAD
+/* sha512 related params and structures*/
+#define DIGEST_NWORDS (SHA512_DIGEST_NWORDS * 2)
+#define MB_BUFS SHA512_MAX_LANES
+#define HASH_CTX_MGR SHA512_HASH_CTX_MGR
+#define HASH_CTX SHA512_HASH_CTX
+
+#define OSSL_THREAD_FUNC sha512_ossl_func
+#define OSSL_HASH_FUNC SHA512
+#define MB_THREAD_FUNC sha512_mb_func
+#define CTX_MGR_INIT sha512_ctx_mgr_init
+#define CTX_MGR_SUBMIT sha512_ctx_mgr_submit
+#define CTX_MGR_FLUSH sha512_ctx_mgr_flush
+
+#define rounds_buf SHA512_MAX_LANES
+
+#include "md5_thread.c"
+
+#undef HASH_THREAD
diff --git a/src/crypto/isa-l/isa-l_crypto/include/aarch64_multibinary.h b/src/crypto/isa-l/isa-l_crypto/include/aarch64_multibinary.h
new file mode 100644
index 000000000..a8f81b232
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/aarch64_multibinary.h
@@ -0,0 +1,301 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#ifndef __AARCH64_MULTIBINARY_H__
+#define __AARCH64_MULTIBINARY_H__
+#ifndef __aarch64__
+#error "This file is for aarch64 only"
+#endif
+#include <asm/hwcap.h>
+#ifdef __ASSEMBLY__
+/**
+ * # mbin_interface : the wrapper layer for the isa-l API
+ *
+ * ## references:
+ * * https://sourceware.org/git/gitweb.cgi?p=glibc.git;a=blob;f=sysdeps/aarch64/dl-trampoline.S
+ * * http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf
+ * * https://static.docs.arm.com/ihi0057/b/IHI0057B_aadwarf64.pdf?_ga=2.80574487.1870739014.1564969896-1634778941.1548729310
+ *
+ * ## Usage:
+ * 1. Define dispather function
+ * 2. name must be \name\()_dispatcher
+ * 3. Prototype should be *"void * \name\()_dispatcher"*
+ * 4. The dispather should return the right function pointer , revision and a string information .
+ **/
+.macro mbin_interface name:req
+ .extern \name\()_dispatcher
+ .section .data
+ .balign 8
+ .global \name\()_dispatcher_info
+ .type \name\()_dispatcher_info,%object
+
+ \name\()_dispatcher_info:
+ .quad \name\()_mbinit //func_entry
+
+ .size \name\()_dispatcher_info,. - \name\()_dispatcher_info
+
+ .balign 8
+ .text
+ \name\()_mbinit:
+ //save lp fp, sub sp
+ .cfi_startproc
+ stp x29, x30, [sp, -224]!
+
+ //add cfi directive to avoid GDB bt cmds error
+ //set cfi(Call Frame Information)
+ .cfi_def_cfa_offset 224
+ .cfi_offset 29, -224
+ .cfi_offset 30, -216
+
+ //save parameter/result/indirect result registers
+ stp x8, x9, [sp, 16]
+ .cfi_offset 8, -208
+ .cfi_offset 9, -200
+ stp x0, x1, [sp, 32]
+ .cfi_offset 0, -192
+ .cfi_offset 1, -184
+ stp x2, x3, [sp, 48]
+ .cfi_offset 2, -176
+ .cfi_offset 3, -168
+ stp x4, x5, [sp, 64]
+ .cfi_offset 4, -160
+ .cfi_offset 5, -152
+ stp x6, x7, [sp, 80]
+ .cfi_offset 6, -144
+ .cfi_offset 7, -136
+ stp q0, q1, [sp, 96]
+ .cfi_offset 64, -128
+ .cfi_offset 65, -112
+ stp q2, q3, [sp, 128]
+ .cfi_offset 66, -96
+ .cfi_offset 67, -80
+ stp q4, q5, [sp, 160]
+ .cfi_offset 68, -64
+ .cfi_offset 69, -48
+ stp q6, q7, [sp, 192]
+ .cfi_offset 70, -32
+ .cfi_offset 71, -16
+
+	/**
+	 * The dispatcher functions have the following prototype:
+	 * 	void * function_dispatcher(void)
+	 * Per the AAPCS, the returned function pointer is in x0.
+	 */
+
+
+ bl \name\()_dispatcher
+ //restore temp/indirect result registers
+ ldp x8, x9, [sp, 16]
+ .cfi_restore 8
+ .cfi_restore 9
+
+ // save function entry
+ str x0, [x9]
+
+ //restore parameter/result registers
+ ldp x0, x1, [sp, 32]
+ .cfi_restore 0
+ .cfi_restore 1
+ ldp x2, x3, [sp, 48]
+ .cfi_restore 2
+ .cfi_restore 3
+ ldp x4, x5, [sp, 64]
+ .cfi_restore 4
+ .cfi_restore 5
+ ldp x6, x7, [sp, 80]
+ .cfi_restore 6
+ .cfi_restore 7
+ ldp q0, q1, [sp, 96]
+ .cfi_restore 64
+ .cfi_restore 65
+ ldp q2, q3, [sp, 128]
+ .cfi_restore 66
+ .cfi_restore 67
+ ldp q4, q5, [sp, 160]
+ .cfi_restore 68
+ .cfi_restore 69
+ ldp q6, q7, [sp, 192]
+ .cfi_restore 70
+ .cfi_restore 71
+	//restore fp and lr, release the stack frame
+ ldp x29, x30, [sp], 224
+ //restore cfi setting
+ .cfi_restore 30
+ .cfi_restore 29
+ .cfi_def_cfa_offset 0
+ .cfi_endproc
+
+ .global \name
+ .type \name,%function
+ .align 2
+ \name\():
+ adrp x9, :got:\name\()_dispatcher_info
+ ldr x9, [x9, #:got_lo12:\name\()_dispatcher_info]
+ ldr x10,[x9]
+ br x10
+ .size \name,. - \name
+
+.endm
+
+/**
+ * mbin_interface_base is used for the interfaces which have only a
+ * generic (noarch) implementation.
+ */
+.macro mbin_interface_base name:req, base:req
+ .extern \base
+ .section .data
+ .balign 8
+ .global \name\()_dispatcher_info
+ .type \name\()_dispatcher_info,%object
+
+ \name\()_dispatcher_info:
+ .quad \base //func_entry
+ .size \name\()_dispatcher_info,. - \name\()_dispatcher_info
+
+ .balign 8
+ .text
+ .global \name
+ .type \name,%function
+ .align 2
+ \name\():
+ adrp x9, :got:\name\()_dispatcher_info
+ ldr x9, [x9, #:got_lo12:\name\()_dispatcher_info]
+ ldr x10,[x9]
+ br x10
+ .size \name,. - \name
+
+.endm
+
+#else /* __ASSEMBLY__ */
+#include <sys/auxv.h>
+
+
+
+#define DEFINE_INTERFACE_DISPATCHER(name) \
+ void * name##_dispatcher(void)
+
+#define PROVIDER_BASIC(name) \
+ PROVIDER_INFO(name##_base)
+
+#define DO_DIGNOSTIC(x) _Pragma GCC diagnostic ignored "-W"#x
+#define DO_PRAGMA(x) _Pragma (#x)
+#define DIGNOSTIC_IGNORE(x) DO_PRAGMA(GCC diagnostic ignored #x)
+#define DIGNOSTIC_PUSH() DO_PRAGMA(GCC diagnostic push)
+#define DIGNOSTIC_POP() DO_PRAGMA(GCC diagnostic pop)
+
+
+#define PROVIDER_INFO(_func_entry) \
+ ({ DIGNOSTIC_PUSH() \
+ DIGNOSTIC_IGNORE(-Wnested-externs) \
+ extern void _func_entry(void); \
+ DIGNOSTIC_POP() \
+ _func_entry; \
+ })
+
+/**
+ * Micro-architecture definitions
+ * Reference: https://developer.arm.com/docs/ddi0595/f/aarch64-system-registers/midr_el1
+ */
+
+#define CPU_IMPLEMENTER_RESERVE 0x00
+#define CPU_IMPLEMENTER_ARM 0x41
+
+
+#define CPU_PART_CORTEX_A57 0xD07
+#define CPU_PART_CORTEX_A72 0xD08
+#define CPU_PART_NEOVERSE_N1 0xD0C
+
+#define MICRO_ARCH_ID(imp,part) \
+ (((CPU_IMPLEMENTER_##imp&0xff)<<24)|((CPU_PART_##part&0xfff)<<4))
+
+#ifndef HWCAP_CPUID
+#define HWCAP_CPUID (1<<11)
+#endif
+
+/**
+ * @brief get_micro_arch_id
+ * Reads the micro-architecture identification register (MIDR_EL1) if possible.
+ * This provides micro-architecture information and makes micro-architecture
+ * specific optimization possible. The mrs instruction traps into the kernel,
+ * so this function should be called only in a dispatcher, which runs only once
+ * in the program lifecycle. The HWCAP requirements must also be met, which
+ * ensures that no illegal instruction errors occur.
+ *
+ * NOTICE:
+ * - HWCAP_CPUID must be available; otherwise zero is returned.
+ * - It MUST be called only inside a dispatcher.
+ * - It MUST meet the HWCAP requirements.
+ *
+ * Example:
+ * DEFINE_INTERFACE_DISPATCHER(crc32_iscsi)
+ * {
+ * unsigned long auxval = getauxval(AT_HWCAP);
+ *       // this HWCAP check is mandatory before reading MIDR_EL1.
+ * if ((HWCAP_CRC32 | HWCAP_PMULL) == (auxval & (HWCAP_CRC32 | HWCAP_PMULL))) {
+ * switch (get_micro_arch_id()) {
+ * case MICRO_ARCH_ID(ARM, CORTEX_A57):
+ * return PROVIDER_INFO(crc32_pmull_crc_for_a57);
+ * case MICRO_ARCH_ID(ARM, CORTEX_A72):
+ * return PROVIDER_INFO(crc32_pmull_crc_for_a72);
+ * case MICRO_ARCH_ID(ARM, NEOVERSE_N1):
+ * return PROVIDER_INFO(crc32_pmull_crc_for_n1);
+ *           default:
+ * return PROVIDER_INFO(crc32_pmull_crc_for_others);
+ * }
+ * }
+ * return PROVIDER_BASIC(crc32_iscsi);
+ * }
+ * KNOWN ISSUE:
+ *   On a heterogeneous system (big.LITTLE), it will work, but the performance
+ *   might not be as good as expected.
+ *
+ *   If this function is called on a big core, it will return the function
+ *   optimized for the big core.
+ *
+ *   If execution is then scheduled onto a little core, it will still work,
+ *   but the selected function won't be optimized for the little core, so the
+ *   performance won't be as expected.
+ *
+ * References:
+ * - [CPU Feature detection](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/arm64/cpu-feature-registers.rst?h=v5.5)
+ *
+ */
+static inline uint32_t get_micro_arch_id(void)
+{
+	uint32_t id = CPU_IMPLEMENTER_RESERVE;
+
+	if (getauxval(AT_HWCAP) & HWCAP_CPUID) {
+		asm("mrs %0, MIDR_EL1 " : "=r" (id));
+	}
+	return id & 0xff00fff0;
+}
+
+
+
+#endif /* __ASSEMBLY__ */
+#endif
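Usage sketch (illustrative only, not part of this patch): the C half of a multibinary interface built from the macros above. The interface name foo and the implementation symbols foo_asimd / foo_base are hypothetical placeholders; only the macros in this header, getauxval() and the HWCAP_ASIMD bit from <asm/hwcap.h> are assumed. The matching assembly side would invoke "mbin_interface foo", which caches whatever pointer this dispatcher returns on the first call.

#include <sys/auxv.h>
#include "aarch64_multibinary.h"

DEFINE_INTERFACE_DISPATCHER(foo)
{
	unsigned long auxval = getauxval(AT_HWCAP);

	/* Pick the SIMD implementation only when the CPU advertises ASIMD. */
	if (auxval & HWCAP_ASIMD)
		return PROVIDER_INFO(foo_asimd);

	/* Otherwise fall back to the generic C implementation foo_base. */
	return PROVIDER_BASIC(foo);
}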
diff --git a/src/crypto/isa-l/isa-l_crypto/include/aes_cbc.h b/src/crypto/isa-l/isa-l_crypto/include/aes_cbc.h
new file mode 100644
index 000000000..aaf87ada1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/aes_cbc.h
@@ -0,0 +1,165 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/**
+ * @file aes_cbc.h
+ * @brief AES CBC encryption/decryption function prototypes.
+ *
+ */
+#ifndef _AES_CBC_h
+#define _AES_CBC_h
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+
+#endif
+
+typedef enum cbc_key_size { CBC_128_BITS = 16, CBC_192_BITS = 24, CBC_256_BITS = 32} cbc_key_size;
+#define CBC_ROUND_KEY_LEN (16)
+#define CBC_128_KEY_ROUNDS (10+1) /*expanded key holds 10 key rounds plus original key*/
+#define CBC_192_KEY_ROUNDS (12+1) /*expanded key holds 12 key rounds plus original key*/
+#define CBC_256_KEY_ROUNDS (14+1) /*expanded key holds 14 key rounds plus original key*/
+#define CBC_MAX_KEYS_SIZE (CBC_ROUND_KEY_LEN * CBC_256_KEY_ROUNDS)
+
+#define CBC_IV_DATA_LEN (16)
+
+/** @brief holds intermediate key data used in encryption/decryption
+ *
+ */
+struct cbc_key_data { // must be 16 byte aligned
+ uint8_t enc_keys[CBC_MAX_KEYS_SIZE];
+ uint8_t dec_keys[CBC_MAX_KEYS_SIZE];
+};
+
+/** @brief CBC-AES key pre-computation done once for a key
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ * arg 1: in:   pointer to the key
+ * arg 2: key_size: length of the key in bytes (16, 24 or 32)
+ * arg 3: OUT:  pointer to the expanded key data
+ */
+int aes_cbc_precomp(
+ uint8_t *key,
+ int key_size,
+ struct cbc_key_data *keys_blk
+);
+
+/** @brief CBC-AES 128 bit key Decryption
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ * arg 1: in:   pointer to input (cipher text)
+ * arg 2: IV:   pointer to the IV, 16 bytes, aligned to a 16 byte boundary
+ * arg 3: keys: pointer to the expanded keys; must be on a 16 byte boundary, length = key size * key rounds
+ * arg 4: OUT:  pointer to output (plain text ... in-place allowed)
+ * arg 5: len_bytes:  length in bytes (must be a multiple of 16)
+ */
+void aes_cbc_dec_128(
+ void *in, //!< Input cipher text
+ uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary
+ uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or dec_keys of cbc_key_data
+ void *out, //!< Output plain text
+ uint64_t len_bytes //!< Must be a multiple of 16 bytes
+ );
+
+/** @brief CBC-AES 192 bit key Decryption
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+void aes_cbc_dec_192(
+ void *in, //!< Input cipher text
+ uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary
+ uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or dec_keys of cbc_key_data
+ void *out, //!< Output plain text
+ uint64_t len_bytes //!< Must be a multiple of 16 bytes
+ );
+
+/** @brief CBC-AES 256 bit key Decryption
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+void aes_cbc_dec_256(
+ void *in, //!< Input cipher text
+ uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary
+ uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or dec_keys of cbc_key_data
+ void *out, //!< Output plain text
+ uint64_t len_bytes //!< Must be a multiple of 16 bytes
+ );
+
+/** @brief CBC-AES 128 bit key Encryption
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ * arg 1: in:   pointer to input (plain text)
+ * arg 2: IV:   pointer to the IV, 16 bytes, aligned to a 16 byte boundary
+ * arg 3: keys: pointer to the expanded keys; must be on a 16 byte boundary, length = key size * key rounds
+ * arg 4: OUT:  pointer to output (cipher text ... in-place allowed)
+ * arg 5: len_bytes:  length in bytes (must be a multiple of 16)
+ */
+int aes_cbc_enc_128(
+ void *in, //!< Input plain text
+ uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary
+ uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or enc_keys of cbc_key_data
+ void *out, //!< Output cipher text
+ uint64_t len_bytes //!< Must be a multiple of 16 bytes
+ );
+/** @brief CBC-AES 192 bit key Encryption
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+int aes_cbc_enc_192(
+ void *in, //!< Input plain text
+ uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary
+ uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or enc_keys of cbc_key_data
+ void *out, //!< Output cipher text
+ uint64_t len_bytes //!< Must be a multiple of 16 bytes
+ );
+
+/** @brief CBC-AES 256 bit key Encryption
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+int aes_cbc_enc_256(
+ void *in, //!< Input plain text
+ uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary
+ uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or enc_keys of cbc_key_data
+ void *out, //!< Output cipher text
+ uint64_t len_bytes //!< Must be a multiple of 16 bytes
+ );
+
+#ifdef __cplusplus
+}
+#endif //__cplusplus
+#endif //ifndef _AES_CBC_h
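Usage sketch (illustrative only, not part of this patch): one possible AES-128-CBC encrypt/decrypt sequence using only the prototypes above. The key, IV and data contents are placeholders; the requirements carried over from the prototypes are the 16-byte alignment of the key schedule and IV and the 16-byte-multiple data length.

#include "aes_cbc.h"

static struct cbc_key_data key_data __attribute__((aligned(16)));

void cbc_example(void)
{
	uint8_t key[CBC_128_BITS] = {0};                 /* illustrative all-zero key  */
	uint8_t iv[CBC_IV_DATA_LEN] __attribute__((aligned(16))) = {0};
	uint8_t pt[64] = {0}, ct[64];                    /* length is a multiple of 16 */

	aes_cbc_precomp(key, CBC_128_BITS, &key_data);   /* expand enc + dec schedules once */
	aes_cbc_enc_128(pt, iv, key_data.enc_keys, ct, sizeof(pt));
	aes_cbc_dec_128(ct, iv, key_data.dec_keys, pt, sizeof(ct));
}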
diff --git a/src/crypto/isa-l/isa-l_crypto/include/aes_gcm.h b/src/crypto/isa-l/isa-l_crypto/include/aes_gcm.h
new file mode 100644
index 000000000..b407b7f6b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/aes_gcm.h
@@ -0,0 +1,613 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/**
+ * @file aes_gcm.h
+ * @brief AES GCM encryption/decryption function prototypes.
+ *
+ * At build time there is an option to use non-temporal loads and stores
+ * selected by defining the compile time option NT_LDST. The use of this option
+ * places the following restriction on the gcm encryption functions:
+ *
+ * - The plaintext and ciphertext buffers must be aligned on a 64 byte boundary.
+ *
+ * - When using the streaming API, all partial input buffers must be a multiple
+ * of 64 bytes long except for the last input buffer.
+ *
+ * - In-place encryption/decryption is not recommended.
+ *
+ */
+
+/*
+; References:
+; This code was derived and highly optimized from the code described in paper:
+; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+;
+; For the shift-based reductions used in this code, we used the method described in paper:
+; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
+;
+;
+;
+; Assumptions: Support for SSE4.1 or greater, AVX or AVX2
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+ */
+#ifndef _AES_GCM_h
+#define _AES_GCM_h
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+#define MAX_TAG_LEN (16)
+//
+// IV data is limited to 16 bytes. The last DWORD (4 bytes) must be 0x1
+//
+#define GCM_IV_LEN (16)
+#define GCM_IV_DATA_LEN (12)
+#define GCM_IV_END_MARK {0x00, 0x00, 0x00, 0x01};
+#define GCM_IV_END_START (12)
+
+#define LONGEST_TESTED_AAD_LENGTH (2* 1024)
+
+// Key lengths of 128 and 256 supported
+#define GCM_128_KEY_LEN (16)
+#define GCM_256_KEY_LEN (32)
+
+#define GCM_BLOCK_LEN 16
+#define GCM_ENC_KEY_LEN 16
+#define GCM_KEY_SETS (15) /*exp key + 14 exp round keys*/
+
+/**
+ * @brief holds intermediate key data needed to improve performance
+ *
+ * gcm_data holds internal key information used by gcm128 and gcm256.
+ */
+struct gcm_data {
+ uint8_t expanded_keys[GCM_ENC_KEY_LEN * GCM_KEY_SETS];
+ uint8_t shifted_hkey_1[GCM_ENC_KEY_LEN]; // store HashKey <<1 mod poly here
+ uint8_t shifted_hkey_2[GCM_ENC_KEY_LEN]; // store HashKey^2 <<1 mod poly here
+ uint8_t shifted_hkey_3[GCM_ENC_KEY_LEN]; // store HashKey^3 <<1 mod poly here
+ uint8_t shifted_hkey_4[GCM_ENC_KEY_LEN]; // store HashKey^4 <<1 mod poly here
+ uint8_t shifted_hkey_5[GCM_ENC_KEY_LEN]; // store HashKey^5 <<1 mod poly here
+ uint8_t shifted_hkey_6[GCM_ENC_KEY_LEN]; // store HashKey^6 <<1 mod poly here
+ uint8_t shifted_hkey_7[GCM_ENC_KEY_LEN]; // store HashKey^7 <<1 mod poly here
+ uint8_t shifted_hkey_8[GCM_ENC_KEY_LEN]; // store HashKey^8 <<1 mod poly here
+ uint8_t shifted_hkey_1_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes)
+ uint8_t shifted_hkey_2_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+ uint8_t shifted_hkey_3_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+ uint8_t shifted_hkey_4_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+ uint8_t shifted_hkey_5_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+ uint8_t shifted_hkey_6_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+ uint8_t shifted_hkey_7_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+ uint8_t shifted_hkey_8_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+ // init, update and finalize context data
+ uint8_t aad_hash[GCM_BLOCK_LEN];
+ uint64_t aad_length;
+ uint64_t in_length;
+ uint8_t partial_block_enc_key[GCM_BLOCK_LEN];
+ uint8_t orig_IV[GCM_BLOCK_LEN];
+ uint8_t current_counter[GCM_BLOCK_LEN];
+ uint64_t partial_block_length;
+};
+
+/**
+ * @brief holds intermediate key data needed to improve performance
+ *
+ * gcm_key_data holds internal key information used by gcm128, gcm192 and gcm256.
+ */
+#ifdef __WIN32
+__declspec(align(16))
+#endif /* WIN32 */
+struct gcm_key_data {
+ uint8_t expanded_keys[GCM_ENC_KEY_LEN * GCM_KEY_SETS];
+ uint8_t shifted_hkey_1[GCM_ENC_KEY_LEN]; // store HashKey <<1 mod poly here
+ uint8_t shifted_hkey_2[GCM_ENC_KEY_LEN]; // store HashKey^2 <<1 mod poly here
+ uint8_t shifted_hkey_3[GCM_ENC_KEY_LEN]; // store HashKey^3 <<1 mod poly here
+ uint8_t shifted_hkey_4[GCM_ENC_KEY_LEN]; // store HashKey^4 <<1 mod poly here
+ uint8_t shifted_hkey_5[GCM_ENC_KEY_LEN]; // store HashKey^5 <<1 mod poly here
+ uint8_t shifted_hkey_6[GCM_ENC_KEY_LEN]; // store HashKey^6 <<1 mod poly here
+ uint8_t shifted_hkey_7[GCM_ENC_KEY_LEN]; // store HashKey^7 <<1 mod poly here
+ uint8_t shifted_hkey_8[GCM_ENC_KEY_LEN]; // store HashKey^8 <<1 mod poly here
+ uint8_t shifted_hkey_1_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits
+ uint8_t shifted_hkey_2_k[GCM_ENC_KEY_LEN]; // and Low 64b of HashKey^n <<1 mod poly
+ uint8_t shifted_hkey_3_k[GCM_ENC_KEY_LEN]; // here (for Karatsuba purposes)
+ uint8_t shifted_hkey_4_k[GCM_ENC_KEY_LEN];
+ uint8_t shifted_hkey_5_k[GCM_ENC_KEY_LEN];
+ uint8_t shifted_hkey_6_k[GCM_ENC_KEY_LEN];
+ uint8_t shifted_hkey_7_k[GCM_ENC_KEY_LEN];
+ uint8_t shifted_hkey_8_k[GCM_ENC_KEY_LEN];
+#ifdef GCM_BIG_DATA
+ uint8_t shifted_hkey_n_k[GCM_ENC_KEY_LEN * (128 - 16)]; // Big data version needs 128
+#else
+	uint8_t shifted_hkey_n_k[GCM_ENC_KEY_LEN * (48 - 16)];	// other (VAES) versions need 48
+#endif
+}
+#if defined (__unix__) || (__APPLE__) || (__MINGW32__)
+ __attribute__ ((aligned (16)));
+#else
+ ;
+#endif
+
+/**
+ * @brief holds GCM operation context
+ */
+struct gcm_context_data {
+ // init, update and finalize context data
+ uint8_t aad_hash[GCM_BLOCK_LEN];
+ uint64_t aad_length;
+ uint64_t in_length;
+ uint8_t partial_block_enc_key[GCM_BLOCK_LEN];
+ uint8_t orig_IV[GCM_BLOCK_LEN];
+ uint8_t current_counter[GCM_BLOCK_LEN];
+ uint64_t partial_block_length;
+};
+
+/* ------------------ New interface for separate expanded keys ------------ */
+
+/**
+ * @brief GCM-AES Encryption using 128 bit keys
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_enc_128(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed
+ uint8_t const *in, //!< Plaintext input
+ uint64_t len, //!< Length of data in Bytes for encryption
+ uint8_t *iv, //!< iv pointer to 12 byte IV structure.
+			 //!< Internally, the library concatenates the 0x00000001 value to it.
+ uint8_t const *aad, //!< Additional Authentication Data (AAD)
+ uint64_t aad_len, //!< Length of AAD
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+/**
+ * @brief GCM-AES Encryption using 256 bit keys
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_enc_256(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed
+ uint8_t const *in, //!< Plaintext input
+ uint64_t len, //!< Length of data in Bytes for encryption
+ uint8_t *iv, //!< iv pointer to 12 byte IV structure.
+			 //!< Internally, the library concatenates the 0x00000001 value to it.
+ uint8_t const *aad, //!< Additional Authentication Data (AAD)
+ uint64_t aad_len, //!< Length of AAD
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+
+/**
+ * @brief GCM-AES Decryption using 128 bit keys
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_dec_128(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed
+ uint8_t const *in, //!< Ciphertext input
+ uint64_t len, //!< Length of data in Bytes for decryption
+ uint8_t *iv, //!< iv pointer to 12 byte IV structure.
+			 //!< Internally, the library concatenates the 0x00000001 value to it.
+ uint8_t const *aad, //!< Additional Authentication Data (AAD)
+ uint64_t aad_len, //!< Length of AAD
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+/**
+ * @brief GCM-AES Decryption using 256 bit keys
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_dec_256(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed
+ uint8_t const *in, //!< Ciphertext input
+ uint64_t len, //!< Length of data in Bytes for decryption
+ uint8_t *iv, //!< iv pointer to 12 byte IV structure.
+			 //!< Internally, the library concatenates the 0x00000001 value to it.
+ uint8_t const *aad, //!< Additional Authentication Data (AAD)
+ uint64_t aad_len, //!< Length of AAD
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+
+/**
+ * @brief Start an AES-GCM Encryption message, 128 bit key
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_init_128(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *iv, //!< Pointer to 12 byte IV structure
+			//!< Internally, the library concatenates the 0x00000001 value to it
+ uint8_t const *aad, //!< Additional Authentication Data (AAD)
+ uint64_t aad_len //!< Length of AAD
+ );
+
+/**
+ * @brief Start an AES-GCM Encryption message, 256 bit key
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_init_256(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *iv, //!< Pointer to 12 byte IV structure
+			//!< Internally, the library concatenates the 0x00000001 value to it
+ uint8_t const *aad, //!< Additional Authentication Data (AAD)
+ uint64_t aad_len //!< Length of AAD
+ );
+
+/**
+ * @brief Encrypt a block of an AES-128-GCM Encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_enc_128_update(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed.
+ const uint8_t *in, //!< Plaintext input
+ uint64_t len //!< Length of data in Bytes for encryption
+ );
+
+/**
+ * @brief Encrypt a block of an AES-256-GCM Encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_enc_256_update(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed.
+ const uint8_t *in, //!< Plaintext input
+ uint64_t len //!< Length of data in Bytes for encryption
+ );
+
+/**
+ * @brief Decrypt a block of an AES-128-GCM Encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_dec_128_update(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed.
+ const uint8_t *in, //!< Ciphertext input
+ uint64_t len //!< Length of data in Bytes for decryption
+ );
+
+/**
+ * @brief Decrypt a block of an AES-256-GCM Encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_dec_256_update(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed.
+ const uint8_t *in, //!< Ciphertext input
+ uint64_t len //!< Length of data in Bytes for decryption
+ );
+
+/**
+ * @brief End encryption of an AES-128-GCM Encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_enc_128_finalize(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+/**
+ * @brief End encryption of an AES-256-GCM Encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_enc_256_finalize(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+/**
+ * @brief End decryption of an AES-128-GCM Encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_dec_128_finalize(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+/**
+ * @brief End decryption of an AES-256-GCM Encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_dec_256_finalize(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+/**
+ * @brief Pre-processes GCM key data 128 bit
+ *
+ * Prefills the gcm key data with key values for each round and
+ * the initial sub hash key for tag encoding
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_pre_128(
+ const void *key, //!< Pointer to key data
+ struct gcm_key_data *key_data //!< GCM expanded key data
+ );
+
+/**
+ * @brief Pre-processes GCM key data 256 bit
+ *
+ * Prefills the gcm key data with key values for each round and
+ * the initial sub hash key for tag encoding
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_pre_256(
+ const void *key, //!< Pointer to key data
+ struct gcm_key_data *key_data //!< GCM expanded key data
+ );
+
+
+
+/* ---- NT versions ---- */
+/**
+ * @brief GCM-AES Encryption using 128 bit keys, Non-temporal data
+ *
+ * Non-temporal version of encrypt has additional restrictions:
+ *  - The plaintext and ciphertext buffers must be aligned on a 64 byte boundary.
+ * - In-place encryption/decryption is not recommended. Performance can be slow.
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_enc_128_nt(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed
+ uint8_t const *in, //!< Plaintext input
+ uint64_t len, //!< Length of data in Bytes for encryption
+ uint8_t *iv, //!< iv pointer to 12 byte IV structure.
+			 //!< Internally, the library concatenates the 0x00000001 value to it.
+ uint8_t const *aad, //!< Additional Authentication Data (AAD)
+ uint64_t aad_len, //!< Length of AAD
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+/**
+ * @brief GCM-AES Encryption using 256 bit keys, Non-temporal data
+ *
+ * Non-temporal version of encrypt has additional restrictions:
+ *  - The plaintext and ciphertext buffers must be aligned on a 64 byte boundary.
+ * - In-place encryption/decryption is not recommended. Performance can be slow.
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_enc_256_nt(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed
+ uint8_t const *in, //!< Plaintext input
+ uint64_t len, //!< Length of data in Bytes for encryption
+ uint8_t *iv, //!< iv pointer to 12 byte IV structure.
+			 //!< Internally, the library concatenates the 0x00000001 value to it.
+ uint8_t const *aad, //!< Additional Authentication Data (AAD)
+ uint64_t aad_len, //!< Length of AAD
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+
+/**
+ * @brief GCM-AES Decryption using 128 bit keys, Non-temporal data
+ *
+ * Non-temporal version of decrypt has additional restrictions:
+ *  - The plaintext and ciphertext buffers must be aligned on a 64 byte boundary.
+ * - In-place encryption/decryption is not recommended. Performance can be slow.
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_dec_128_nt(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed
+ uint8_t const *in, //!< Ciphertext input
+ uint64_t len, //!< Length of data in Bytes for decryption
+ uint8_t *iv, //!< iv pointer to 12 byte IV structure.
+			 //!< Internally, the library concatenates the 0x00000001 value to it.
+ uint8_t const *aad, //!< Additional Authentication Data (AAD)
+ uint64_t aad_len, //!< Length of AAD
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+/**
+ * @brief GCM-AES Decryption using 256 bit keys, Non-temporal data
+ *
+ * Non-temporal version of decrypt has additional restrictions:
+ *  - The plaintext and ciphertext buffers must be aligned on a 64 byte boundary.
+ * - In-place encryption/decryption is not recommended. Performance can be slow.
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_dec_256_nt(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed
+ uint8_t const *in, //!< Ciphertext input
+ uint64_t len, //!< Length of data in Bytes for decryption
+ uint8_t *iv, //!< iv pointer to 12 byte IV structure.
+			 //!< Internally, the library concatenates the 0x00000001 value to it.
+ uint8_t const *aad, //!< Additional Authentication Data (AAD)
+ uint64_t aad_len, //!< Length of AAD
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+
+/**
+ * @brief Encrypt a block of an AES-128-GCM Encryption message, Non-temporal data
+ *
+ * Non-temporal version of encrypt update has additional restrictions:
+ *  - The plaintext and ciphertext buffers must be aligned on a 64 byte boundary.
+ * - All partial input buffers must be a multiple of 64 bytes long except for
+ * the last input buffer.
+ * - In-place encryption/decryption is not recommended. Performance can be slow.
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_enc_128_update_nt(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed.
+ const uint8_t *in, //!< Plaintext input
+ uint64_t len //!< Length of data in Bytes for encryption
+ );
+
+/**
+ * @brief Encrypt a block of an AES-256-GCM Encryption message, Non-temporal data
+ *
+ * Non-temporal version of encrypt update has additional restrictions:
+ *  - The plaintext and ciphertext buffers must be aligned on a 64 byte boundary.
+ * - All partial input buffers must be a multiple of 64 bytes long except for
+ * the last input buffer.
+ * - In-place encryption/decryption is not recommended. Performance can be slow.
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_enc_256_update_nt(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed.
+ const uint8_t *in, //!< Plaintext input
+ uint64_t len //!< Length of data in Bytes for encryption
+ );
+
+/**
+ * @brief Decrypt a block of an AES-128-GCM Encryption message, Non-temporal data
+ *
+ * Non-temporal version of decrypt update has additional restrictions:
+ *  - The plaintext and ciphertext buffers must be aligned on a 64 byte boundary.
+ * - All partial input buffers must be a multiple of 64 bytes long except for
+ * the last input buffer.
+ * - In-place encryption/decryption is not recommended. Performance can be slow.
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_dec_128_update_nt(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed.
+ const uint8_t *in, //!< Ciphertext input
+ uint64_t len //!< Length of data in Bytes for decryption
+ );
+
+/**
+ * @brief Decrypt a block of an AES-256-GCM Encryption message, Non-temporal data
+ *
+ * Non-temporal version of decrypt update has additional restrictions:
+ *  - The plaintext and ciphertext buffers must be aligned on a 64 byte boundary.
+ * - All partial input buffers must be a multiple of 64 bytes long except for
+ * the last input buffer.
+ * - In-place encryption/decryption is not recommended. Performance can be slow.
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_dec_256_update_nt(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed.
+ const uint8_t *in, //!< Ciphertext input
+ uint64_t len //!< Length of data in Bytes for decryption
+ );
+
+
+#ifdef __cplusplus
+}
+#endif //__cplusplus
+#endif //ifndef _AES_GCM_h
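Usage sketch (illustrative only, not part of this patch): a single-shot AES-128-GCM encryption using the prototypes above. The key, IV, AAD and message contents are placeholders; the streaming init/update/finalize variants follow the same pattern with the same gcm_context_data.

#include "aes_gcm.h"

void gcm_example(void)
{
	struct gcm_key_data key_data;         /* 16-byte aligned via the attribute above       */
	struct gcm_context_data ctx;
	uint8_t key[GCM_128_KEY_LEN] = {0};   /* illustrative all-zero key                     */
	uint8_t iv[GCM_IV_DATA_LEN] = {0};    /* 12 bytes; 0x00000001 is appended internally   */
	uint8_t aad[16] = {0};
	uint8_t pt[64] = {0}, ct[64], tag[MAX_TAG_LEN];

	aes_gcm_pre_128(key, &key_data);      /* expand round keys and hash subkeys once       */
	aes_gcm_enc_128(&key_data, &ctx, ct, pt, sizeof(pt),
			iv, aad, sizeof(aad), tag, MAX_TAG_LEN);
}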
diff --git a/src/crypto/isa-l/isa-l_crypto/include/aes_keyexp.h b/src/crypto/isa-l/isa-l_crypto/include/aes_keyexp.h
new file mode 100644
index 000000000..6ecded301
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/aes_keyexp.h
@@ -0,0 +1,76 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _KEYEXP_128_H
+#define _KEYEXP_128_H
+
+/**
+ * @file aes_keyexp.h
+ * @brief AES key expansion functions
+ *
+ * This defines the interface to key expansion functions.
+ */
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @brief AES key expansion 128 bit
+ * @requires SSE4.1
+ */
+void aes_keyexp_128(
+ const uint8_t *key, //!< input key for AES-128, 16 bytes
+ uint8_t *exp_key_enc, //!< expanded encryption keys, 16*11 bytes
+ uint8_t *exp_key_dec //!< expanded decryption keys, 16*11 bytes
+ );
+
+/** @brief AES key expansion 192 bit
+ * @requires SSE4.1
+ */
+void aes_keyexp_192(
+	const uint8_t *key,		//!< input key for AES-192, 24 bytes (16*1.5)
+ uint8_t *exp_key_enc, //!< expanded encryption keys, 16*13 bytes
+ uint8_t *exp_key_dec //!< expanded decryption keys, 16*13 bytes
+ );
+
+/** @brief AES key expansion 256 bit
+ * @requires SSE4.1
+ */
+void aes_keyexp_256(
+ const uint8_t *key, //!< input key for AES-256, 16*2 bytes
+ uint8_t *exp_key_enc, //!< expanded encryption keys, 16*15 bytes
+ uint8_t *exp_key_dec //!< expanded decryption keys, 16*15 bytes
+ );
+
+#ifdef __cplusplus
+}
+#endif //__cplusplus
+#endif //ifndef _KEYEXP_128_H
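Usage sketch (illustrative only, not part of this patch): expanding a 16-byte AES-128 key into the 11-round encryption and decryption schedules that the CBC and XTS expanded-key functions consume; the key value is a placeholder.

#include "aes_keyexp.h"

void keyexp_example(void)
{
	const uint8_t key[16] = {0};   /* illustrative all-zero AES-128 key */
	uint8_t enc_keys[16 * 11];     /* 11 x 16-byte encryption round keys */
	uint8_t dec_keys[16 * 11];     /* 11 x 16-byte decryption round keys */

	aes_keyexp_128(key, enc_keys, dec_keys);
}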
diff --git a/src/crypto/isa-l/isa-l_crypto/include/aes_xts.h b/src/crypto/isa-l/isa-l_crypto/include/aes_xts.h
new file mode 100644
index 000000000..2021284f5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/aes_xts.h
@@ -0,0 +1,214 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#ifndef _AES_XTS_H
+#define _AES_XTS_H
+
+/**
+ * @file aes_xts.h
+ * @brief AES XTS encryption function prototypes.
+ *
+ * This defines the interface to optimized AES XTS functions
+ *
+ * <b>Pre-expanded keys</b>
+ *
+ * For encryption, the pre-expanded keys are stored in the order in which they will
+ * be used. As an example, if Key[0] is the 128-bit initial key used for an AES-128
+ * encryption, the rest of the keys are stored as follows:
+ *
+ * <ul>
+ * <li> Key[0] : Initial encryption key
+ * <li> Key[1] : Round 1 encryption key
+ * <li> Key[2] : Round 2 encryption key
+ * <li> ...
+ * <li> Key[10] : Round 10 encryption key
+ * </ul>
+ *
+ * For decryption, the order of keys is reversed. However, we apply the
+ * necessary aesimc instructions before storing the expanded keys. For the same key
+ * used above, the pre-expanded keys will be stored as follows:
+ *
+ * <ul>
+ * <li> Key[0] : Round 10 encryption key
+ * <li> Key[1] : aesimc(Round 9 encryption key)
+ * <li> Key[2] : aesimc(Round 8 encryption key)
+ * <li> ...
+ * <li> Key[9] : aesimc(Round 1 encryption key)
+ * <li> Key[10] : Initial encryption key
+ * </ul>
+ *
+ * <b>Note:</b> The expanded key decryption requires a decryption key only for the block
+ * decryption step. The tweak step in the expanded key decryption requires the same expanded
+ * encryption key that is used in the expanded key encryption.
+ *
+ * <b>Input and Output Buffers </b>
+ *
+ * The input and output buffers can be overlapping as long as the output buffer
+ * pointer is not less than the input buffer pointer. If the two pointers are the
+ * same, then encryption/decryption will occur in-place.
+ *
+ * <b>Data Length</b>
+ *
+ * <ul>
+ * <li> The functions support any data length greater than or equal to 16 bytes.
+ * <li> Data length is a 64-bit value, which makes the largest possible data length
+ * 2^64 - 1 bytes.
+ * <li> For data lengths from 0 to 15 bytes, the functions return without any error
+ * codes, without reading or writing any data.
+ * <li> The functions only support byte lengths, not bits.
+ * </ul>
+ *
+ * <b>Initial Tweak</b>
+ *
+ * The functions accept a 128-bit initial tweak value. The user is responsible for
+ * padding the initial tweak value to this length.
+ *
+ * <b>Data Alignment</b>
+ *
+ * The input and output buffers, keys, pre-expanded keys and initial tweak value
+ * are not required to be aligned to 16 bytes; any alignment works.
+ *
+ */
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @brief XTS-AES-128 Encryption
+ * @requires AES-NI
+ */
+
+void XTS_AES_128_enc(
+ uint8_t *k2, //!< key used for tweaking, 16 bytes
+ uint8_t *k1, //!< key used for encryption of tweaked plaintext, 16 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *pt, //!< plaintext sector input data
+ uint8_t *ct //!< ciphertext sector output data
+ );
+
+/** @brief XTS-AES-128 Encryption with pre-expanded keys
+ * @requires AES-NI
+ */
+
+void XTS_AES_128_enc_expanded_key(
+ uint8_t *k2, //!< expanded key used for tweaking, 16*11 bytes
+ uint8_t *k1, //!< expanded key used for encryption of tweaked plaintext, 16*11 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *pt, //!< plaintext sector input data
+ uint8_t *ct //!< ciphertext sector output data
+ );
+
+/** @brief XTS-AES-128 Decryption
+ * @requires AES-NI
+ */
+
+void XTS_AES_128_dec(
+ uint8_t *k2, //!< key used for tweaking, 16 bytes
+ uint8_t *k1, //!< key used for decryption of tweaked ciphertext, 16 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *ct, //!< ciphertext sector input data
+ uint8_t *pt //!< plaintext sector output data
+ );
+
+/** @brief XTS-AES-128 Decryption with pre-expanded keys
+ * @requires AES-NI
+ */
+
+void XTS_AES_128_dec_expanded_key(
+ uint8_t *k2, //!< expanded key used for tweaking, 16*11 bytes - encryption key is used
+ uint8_t *k1, //!< expanded decryption key used for decryption of tweaked ciphertext, 16*11 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *ct, //!< ciphertext sector input data
+ uint8_t *pt //!< plaintext sector output data
+ );
+
+/** @brief XTS-AES-256 Encryption
+ * @requires AES-NI
+ */
+
+void XTS_AES_256_enc(
+ uint8_t *k2, //!< key used for tweaking, 16*2 bytes
+ uint8_t *k1, //!< key used for encryption of tweaked plaintext, 16*2 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *pt, //!< plaintext sector input data
+ uint8_t *ct //!< ciphertext sector output data
+ );
+
+/** @brief XTS-AES-256 Encryption with pre-expanded keys
+ * @requires AES-NI
+ */
+
+void XTS_AES_256_enc_expanded_key(
+ uint8_t *k2, //!< expanded key used for tweaking, 16*15 bytes
+ uint8_t *k1, //!< expanded key used for encryption of tweaked plaintext, 16*15 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *pt, //!< plaintext sector input data
+ uint8_t *ct //!< ciphertext sector output data
+ );
+
+/** @brief XTS-AES-256 Decryption
+ * @requires AES-NI
+ */
+
+void XTS_AES_256_dec(
+ uint8_t *k2, //!< key used for tweaking, 16*2 bytes
+ uint8_t *k1, //!< key used for decryption of tweaked ciphertext, 16*2 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *ct, //!< ciphertext sector input data
+ uint8_t *pt //!< plaintext sector output data
+ );
+
+/** @brief XTS-AES-256 Decryption with pre-expanded keys
+ * @requires AES-NI
+ */
+
+void XTS_AES_256_dec_expanded_key(
+ uint8_t *k2, //!< expanded key used for tweaking, 16*15 bytes - encryption key is used
+ uint8_t *k1, //!< expanded decryption key used for decryption of tweaked ciphertext, 16*15 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *ct, //!< ciphertext sector input data
+ uint8_t *pt //!< plaintext sector output data
+ );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_AES_XTS_H
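Usage sketch (illustrative only, not part of this patch): encrypting and then decrypting one 512-byte sector with XTS-AES-128. The two 16-byte keys and the initial tweak value are placeholders.

#include "aes_xts.h"

void xts_example(void)
{
	uint8_t k2[16] = {0};            /* tweak key                          */
	uint8_t k1[16] = {0};            /* data encryption key                */
	uint8_t tweak[16] = {0};         /* initial tweak, e.g. sector number  */
	uint8_t pt[512] = {0}, ct[512];

	XTS_AES_128_enc(k2, k1, tweak, sizeof(pt), pt, ct);
	XTS_AES_128_dec(k2, k1, tweak, sizeof(ct), ct, pt);
}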
diff --git a/src/crypto/isa-l/isa-l_crypto/include/datastruct.asm b/src/crypto/isa-l/isa-l_crypto/include/datastruct.asm
new file mode 100644
index 000000000..3298ce374
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/datastruct.asm
@@ -0,0 +1,79 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Macros for defining data structures
+
+; Usage example
+
+;START_FIELDS ; JOB_AES
+;;; name size align
+;FIELD _plaintext, 8, 8 ; pointer to plaintext
+;FIELD _ciphertext, 8, 8 ; pointer to ciphertext
+;FIELD _IV, 16, 8 ; IV
+;FIELD _keys, 8, 8 ; pointer to keys
+;FIELD _len, 4, 4 ; length in bytes
+;FIELD _status, 4, 4 ; status enumeration
+;FIELD _user_data, 8, 8 ; pointer to user data
+;UNION _union, size1, align1, \
+;	                size2,  align2, \
+;	                size3,  align3, \
+;	                ...
+;END_FIELDS
+;%assign _JOB_AES_size _FIELD_OFFSET
+;%assign _JOB_AES_align _STRUCT_ALIGN
+
+%ifndef _DATASTRUCT_ASM_
+%define _DATASTRUCT_ASM_
+
+;; START_FIELDS
+%macro START_FIELDS 0
+%assign _FIELD_OFFSET 0
+%assign _STRUCT_ALIGN 0
+%endm
+
+;; FIELD name size align
+%macro FIELD 3
+%define %%name %1
+%define %%size %2
+%define %%align %3
+
+%assign _FIELD_OFFSET (_FIELD_OFFSET + (%%align) - 1) & (~ ((%%align)-1))
+%%name equ _FIELD_OFFSET
+%assign _FIELD_OFFSET _FIELD_OFFSET + (%%size)
+%if (%%align > _STRUCT_ALIGN)
+%assign _STRUCT_ALIGN %%align
+%endif
+%endm
+
+;; END_FIELDS
+%macro END_FIELDS 0
+%assign _FIELD_OFFSET (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1))
+%endm
+
+%endif ; end ifdef _DATASTRUCT_ASM_
diff --git a/src/crypto/isa-l/isa-l_crypto/include/endian_helper.h b/src/crypto/isa-l/isa-l_crypto/include/endian_helper.h
new file mode 100644
index 000000000..87d90460a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/endian_helper.h
@@ -0,0 +1,83 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _ENDIAN_HELPER_H_
+#define _ENDIAN_HELPER_H_
+
+/**
+ * @file endian_helper.h
+ * @brief Byte order helper routines
+ *
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined (__ICC)
+# define byteswap32(x) _bswap(x)
+# define byteswap64(x) _bswap64(x)
+#elif defined (__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
+# define byteswap32(x) __builtin_bswap32(x)
+# define byteswap64(x) __builtin_bswap64(x)
+#else
+# define byteswap32(x) ( ((x) << 24) \
+ | (((x) & 0xff00) << 8) \
+ | (((x) & 0xff0000) >> 8) \
+ | ((x)>>24))
+# define byteswap64(x) ( (((x) & (0xffull << 0)) << 56) \
+ | (((x) & (0xffull << 8)) << 40) \
+ | (((x) & (0xffull << 16)) << 24) \
+ | (((x) & (0xffull << 24)) << 8) \
+ | (((x) & (0xffull << 32)) >> 8) \
+ | (((x) & (0xffull << 40)) >> 24) \
+ | (((x) & (0xffull << 48)) >> 40) \
+ | (((x) & (0xffull << 56)) >> 56))
+#endif
+
+// This check works when using GCC (or LLVM). Assume little-endian
+// if any other compiler is being used.
+#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) \
+ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define to_le32(x) byteswap32(x)
+#define to_le64(x) byteswap64(x)
+#define to_be32(x) (x)
+#define to_be64(x) (x)
+#else
+#define to_le32(x) (x)
+#define to_le64(x) (x)
+#define to_be32(x) byteswap32(x)
+#define to_be64(x) byteswap64(x)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _ENDIAN_HELPER_H_
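Usage sketch (illustrative only, not part of this patch): writing a 32-bit value in big-endian order regardless of host byte order, using only the to_be32() macro above; the helper name is a placeholder.

#include <stdint.h>
#include <string.h>
#include "endian_helper.h"

static inline void store_be32(uint8_t *out, uint32_t v)
{
	uint32_t be = to_be32(v);      /* byteswap on little-endian hosts, no-op on big-endian */

	memcpy(out, &be, sizeof(be));  /* memcpy keeps the store alignment-safe */
}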
diff --git a/src/crypto/isa-l/isa-l_crypto/include/intrinreg.h b/src/crypto/isa-l/isa-l_crypto/include/intrinreg.h
new file mode 100644
index 000000000..3c7ba2877
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/intrinreg.h
@@ -0,0 +1,65 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+/**
+ * @file intrinreg.h
+ * @brief Defines intrinsic types used by the new hashing API
+ *
+ */
+
+#ifndef _IA64_REGS_H_
+#define _IA64_REGS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _MSC_VER
+# define inline __inline
+#endif
+
+#include <stdint.h>
+#include <immintrin.h>
+
+// Define available register types uniformly.
+/// @cond
+typedef struct{ uint8_t dummy; } intrinreg1;
+typedef struct{ uint16_t dummy; } intrinreg2;
+typedef struct{ uint32_t dummy; } intrinreg4;
+typedef struct{ uint64_t dummy; } intrinreg8;
+typedef __m128 intrinreg16;
+/// @endcond
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _IA64_REGS_H_
diff --git a/src/crypto/isa-l/isa-l_crypto/include/md5_mb.h b/src/crypto/isa-l/isa-l_crypto/include/md5_mb.h
new file mode 100644
index 000000000..fcbae5f62
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/md5_mb.h
@@ -0,0 +1,372 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MD5_MB_H_
+#define _MD5_MB_H_
+
+/**
+ * @file md5_mb.h
+ * @brief Multi-buffer CTX API MD5 function prototypes and structures
+ *
+ * Interface for multi-buffer MD5 functions
+ *
+ * <b> Multi-buffer MD5 Entire or First-Update..Update-Last </b>
+ *
+ * The interface to this multi-buffer hashing code is carried out through the
+ * context-level (CTX) init, submit and flush functions and the MD5_HASH_CTX_MGR and
+ * MD5_HASH_CTX objects. Numerous MD5_HASH_CTX objects may be instantiated by the
+ * application for use with a single MD5_HASH_CTX_MGR.
+ *
+ * The CTX interface functions carry out the initialization and padding of the jobs
+ * entered by the user and add them to the multi-buffer manager. The lower level "scheduler"
+ * layer then processes the jobs in an out-of-order manner. The scheduler layer functions
+ * are internal and are not intended to be invoked directly. Jobs can be submitted
+ * to a CTX as a complete buffer to be hashed, using the HASH_ENTIRE flag, or as partial
+ * jobs which can be started using the HASH_FIRST flag, and later resumed or finished
+ * using the HASH_UPDATE and HASH_LAST flags respectively.
+ *
+ * <b>Note:</b> The submit function does not require data buffers to be block sized.
+ *
+ * The MD5 CTX interface functions are available for 4 architectures: SSE, AVX, AVX2 and
+ * AVX512. In addition, a multibinary interface is provided, which selects the appropriate
+ * architecture-specific function at runtime.
+ *
+ * <b>Usage:</b> The application creates a MD5_HASH_CTX_MGR object and initializes it
+ * with a call to the md5_ctx_mgr_init*() function, where henceforth "*" stands for the
+ * relevant suffix for each architecture; _sse, _avx, _avx2, _avx512 (or no suffix for the
+ * multibinary version). The MD5_HASH_CTX_MGR object will be used to schedule processor
+ * resources, with up to 8 MD5_HASH_CTX objects (or 16 in the AVX2 case, 32 in the AVX512 case)
+ * being processed at a time.
+ *
+ * Each MD5_HASH_CTX must be initialized before first use by the hash_ctx_init macro
+ * defined in multi_buffer.h. After initialization, the application may begin computing
+ * a hash by giving the MD5_HASH_CTX to a MD5_HASH_CTX_MGR using the submit functions
+ * md5_ctx_mgr_submit*() with the HASH_FIRST flag set. When the MD5_HASH_CTX is
+ * returned to the application (via this or a later call to md5_ctx_mgr_submit*() or
+ * md5_ctx_mgr_flush*()), the application can then re-submit it with another call to
+ * md5_ctx_mgr_submit*(), but without the HASH_FIRST flag set.
+ *
+ * Ideally, on the last buffer for that hash, md5_ctx_mgr_submit*() is called with
+ * HASH_LAST, although it is also possible to submit the hash with HASH_LAST and a zero
+ * length if necessary. When a MD5_HASH_CTX is returned after having been submitted with
+ * HASH_LAST, it will contain a valid hash. The MD5_HASH_CTX can be reused immediately
+ * by submitting with HASH_FIRST.
+ *
+ * For example, you would submit hashes with the following flags for the following numbers
+ * of buffers:
+ * <ul>
+ * <li> one buffer: HASH_FIRST | HASH_LAST (or, equivalently, HASH_ENTIRE)
+ * <li> two buffers: HASH_FIRST, HASH_LAST
+ * <li> three buffers: HASH_FIRST, HASH_UPDATE, HASH_LAST
+ * etc.
+ * </ul>
+ *
+ * The order in which MD5_HASH_CTX objects are returned is in general different from the order
+ * in which they are submitted.
+ *
+ * A few possible error conditions exist:
+ * <ul>
+ * <li> Submitting flags other than the allowed entire/first/update/last values
+ * <li> Submitting a context that is currently being managed by a MD5_HASH_CTX_MGR.
+ * <li> Submitting a context after HASH_LAST is used but before HASH_FIRST is set.
+ * </ul>
+ *
+ * These error conditions are reported by returning the MD5_HASH_CTX immediately after
+ * a submit with its error member set to a non-zero error code (defined in
+ * multi_buffer.h). No changes are made to the MD5_HASH_CTX_MGR in the case of an
+ * error; no processing is done for other hashes.
+ *
+ */
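+
+/*
+ * Illustrative sketch of the flow described above: hashing one complete buffer
+ * through the multibinary CTX interface. "buffer" and "len" are hypothetical
+ * placeholders for the application's data.
+ *
+ * \code
+ * MD5_HASH_CTX_MGR *mgr = ...;   // allocated by the application, e.g. on the heap
+ * MD5_HASH_CTX ctx;
+ *
+ * md5_ctx_mgr_init(mgr);
+ * hash_ctx_init(&ctx);           // macro from multi_buffer.h
+ * md5_ctx_mgr_submit(mgr, &ctx, buffer, len, HASH_ENTIRE);
+ * while (md5_ctx_mgr_flush(mgr) != NULL)
+ *         ;                      // drain until all outstanding jobs complete
+ * // ctx.job.result_digest[] now holds the MD5_DIGEST_NWORDS digest words
+ * \endcode
+ */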
+
+#include <stdint.h>
+#include "multi_buffer.h"
+#include "types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Hash Constants and Typedefs
+#define MD5_DIGEST_NWORDS 4
+#define MD5_MAX_LANES 32
+#define MD5_MIN_LANES 8
+#define MD5_BLOCK_SIZE 64
+#define MD5_LOG2_BLOCK_SIZE 6
+#define MD5_PADLENGTHFIELD_SIZE 8
+#define MD5_INITIAL_DIGEST \
+ 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476
+
+typedef uint32_t md5_digest_array[MD5_DIGEST_NWORDS][MD5_MAX_LANES];
+typedef uint32_t MD5_WORD_T;
+
+/** @brief Scheduler layer - Holds info describing a single MD5 job for the multi-buffer manager */
+
+typedef struct {
+ uint8_t* buffer; //!< pointer to data buffer for this job
+ uint32_t len; //!< length of buffer for this job in blocks.
+ DECLARE_ALIGNED(uint32_t result_digest[MD5_DIGEST_NWORDS],64);
+ JOB_STS status; //!< output job status
+ void* user_data; //!< pointer for user's job-related data
+} MD5_JOB;
+
+/** @brief Scheduler layer - Holds arguments for submitted MD5 job */
+
+typedef struct {
+ md5_digest_array digest;
+ uint8_t* data_ptr[MD5_MAX_LANES];
+} MD5_MB_ARGS_X32;
+
+/** @brief Scheduler layer - Lane data */
+
+typedef struct {
+ MD5_JOB *job_in_lane;
+} MD5_LANE_DATA;
+
+/** @brief Scheduler layer - Holds state for multi-buffer MD5 jobs */
+
+typedef struct {
+ MD5_MB_ARGS_X32 args;
+ uint32_t lens[MD5_MAX_LANES];
+	uint64_t unused_lanes[4]; //!< each byte or nibble is the index (0...31 or 0...15) of an unused lane.
+ MD5_LANE_DATA ldata[MD5_MAX_LANES];
+ uint32_t num_lanes_inuse;
+} MD5_MB_JOB_MGR;
+
+/** @brief Context layer - Holds state for multi-buffer MD5 jobs */
+
+typedef struct {
+ MD5_MB_JOB_MGR mgr;
+} MD5_HASH_CTX_MGR;
+
+/** @brief Context layer - Holds info describing a single MD5 job for the multi-buffer CTX manager */
+
+typedef struct {
+ MD5_JOB job; // Must be at struct offset 0.
+ HASH_CTX_STS status; //!< Context status flag
+ HASH_CTX_ERROR error; //!< Context error flag
+ uint64_t total_length; //!< Running counter of length processed for this CTX's job
+ const void* incoming_buffer; //!< pointer to data input buffer for this CTX's job
+ uint32_t incoming_buffer_length; //!< length of buffer for this job in bytes.
+ uint8_t partial_block_buffer[MD5_BLOCK_SIZE * 2]; //!< CTX partial blocks
+ uint32_t partial_block_buffer_length;
+ void* user_data; //!< pointer for user to keep any job-related data
+} MD5_HASH_CTX;
+
+/*******************************************************************
+ * CTX level API function prototypes
+ ******************************************************************/
+
+/**
+ * @brief Initialize the context level MD5 multi-buffer manager structure.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void md5_ctx_mgr_init_sse (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new MD5 job to the context level multi-buffer manager.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_submit_sse (MD5_HASH_CTX_MGR* mgr, MD5_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted MD5 jobs and return when complete.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_flush_sse (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the MD5 multi-buffer manager structure.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void md5_ctx_mgr_init_avx (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new MD5 job to the multi-buffer manager.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_submit_avx (MD5_HASH_CTX_MGR* mgr, MD5_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted MD5 jobs and return when complete.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_flush_avx (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the MD5 multi-buffer manager structure.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void md5_ctx_mgr_init_avx2 (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new MD5 job to the multi-buffer manager.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_submit_avx2 (MD5_HASH_CTX_MGR* mgr, MD5_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted MD5 jobs and return when complete.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_flush_avx2 (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the MD5 multi-buffer manager structure.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void md5_ctx_mgr_init_avx512 (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new MD5 job to the multi-buffer manager.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_submit_avx512 (MD5_HASH_CTX_MGR* mgr, MD5_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted MD5 jobs and return when complete.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_flush_avx512 (MD5_HASH_CTX_MGR* mgr);
+
+/******************** multibinary function prototypes **********************/
+
+/**
+ * @brief Initialize the MD5 multi-buffer manager structure.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void md5_ctx_mgr_init (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new MD5 job to the multi-buffer manager.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_submit (MD5_HASH_CTX_MGR* mgr, MD5_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted MD5 jobs and return when complete.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_flush (MD5_HASH_CTX_MGR* mgr);
+
+
+/*******************************************************************
+ * Scheduler (internal) level out-of-order function prototypes
+ ******************************************************************/
+
+void md5_mb_mgr_init_sse (MD5_MB_JOB_MGR *state);
+MD5_JOB* md5_mb_mgr_submit_sse (MD5_MB_JOB_MGR *state, MD5_JOB* job);
+MD5_JOB* md5_mb_mgr_flush_sse (MD5_MB_JOB_MGR *state);
+
+#define md5_mb_mgr_init_avx md5_mb_mgr_init_sse
+MD5_JOB* md5_mb_mgr_submit_avx (MD5_MB_JOB_MGR *state, MD5_JOB* job);
+MD5_JOB* md5_mb_mgr_flush_avx (MD5_MB_JOB_MGR *state);
+
+void md5_mb_mgr_init_avx2 (MD5_MB_JOB_MGR *state);
+MD5_JOB* md5_mb_mgr_submit_avx2 (MD5_MB_JOB_MGR *state, MD5_JOB* job);
+MD5_JOB* md5_mb_mgr_flush_avx2 (MD5_MB_JOB_MGR *state);
+
+void md5_mb_mgr_init_avx512 (MD5_MB_JOB_MGR *state);
+MD5_JOB* md5_mb_mgr_submit_avx512 (MD5_MB_JOB_MGR *state, MD5_JOB* job);
+MD5_JOB* md5_mb_mgr_flush_avx512 (MD5_MB_JOB_MGR *state);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _MD5_MB_H_
diff --git a/src/crypto/isa-l/isa-l_crypto/include/memcpy.asm b/src/crypto/isa-l/isa-l_crypto/include/memcpy.asm
new file mode 100644
index 000000000..7cb153540
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/memcpy.asm
@@ -0,0 +1,615 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifndef __MEMCPY_ASM__
+%define __MEMCPY_ASM__
+
+%include "reg_sizes.asm"
+
+
+; This file defines a series of macros to copy small to medium amounts
+; of data from memory to memory, where the size is variable but limited.
+;
+; The macros are all called as:
+; memcpy DST, SRC, SIZE, TMP0, TMP1, XTMP0, XTMP1, XTMP2, XTMP3
+; with the parameters defined as:
+; DST : register: pointer to dst (not modified)
+; SRC : register: pointer to src (not modified)
+; SIZE : register: length in bytes (not modified)
+; TMP0 : 64-bit temp GPR (clobbered)
+; TMP1 : 64-bit temp GPR (clobbered)
+; XTMP0 : temp XMM (clobbered)
+; XTMP1 : temp XMM (clobbered)
+; XTMP2 : temp XMM (clobbered)
+; XTMP3 : temp XMM (clobbered)
+;
+; The name indicates the options. The name is of the form:
+; memcpy_<VEC>_<SZ><ZERO><RET>
+; where:
+; <VEC> is either "sse" or "avx" or "avx2"
+; <SZ> is either "64" or "128" and defines largest value of SIZE
+; <ZERO> is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0)
+; <RET> is blank or "_ret". If blank, the code falls through. If "_ret",
+; it does a "ret" at the end
+;
+; For the avx2 versions, the temp XMM registers need to be YMM registers
+; If the SZ is 64, then only two YMM temps are needed, i.e. it is called as:
+; memcpy_avx2_64 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1
+; memcpy_avx2_128 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1, YTMP2, YTMP3
+;
+; For example:
+; memcpy_sse_64 : SSE, 0 <= size < 64, falls through
+; memcpy_avx_64_1 : AVX1, 1 <= size < 64, falls through
+; memcpy_sse_128_ret : SSE, 0 <= size < 128, ends with ret
+; memcpy_avx_128_1_ret : AVX1, 1 <= size < 128, ends with ret
+;
+
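+; Illustrative invocation (a commented sketch; the register choices below are
+; arbitrary examples, not requirements):
+;
+;      ; rdi = dst, rsi = src, rdx = size with 0 <= size < 128
+;      memcpy_sse_128 rdi, rsi, rdx, rax, r10, xmm0, xmm1, xmm2, xmm3
+;      ; execution falls through to here once the copy is done
+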
+%macro memcpy_sse_64 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 0
+%endm
+
+%macro memcpy_sse_64_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 0
+%endm
+
+%macro memcpy_sse_128 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 0
+%endm
+
+%macro memcpy_sse_128_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 0
+%endm
+
+%macro memcpy_sse_64_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 0
+%endm
+
+%macro memcpy_sse_64_1_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 0
+%endm
+
+%macro memcpy_sse_128_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 0
+%endm
+
+%macro memcpy_sse_128_1_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 0
+%endm
+
+
+%macro memcpy_sse_16 5
+ __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 0
+%endm
+
+%macro memcpy_sse_16_1 5
+ __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 0
+%endm
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%macro memcpy_avx_64 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 1
+%endm
+
+%macro memcpy_avx_64_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 1
+%endm
+
+%macro memcpy_avx_128 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 1
+%endm
+
+%macro memcpy_avx_128_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 1
+%endm
+
+%macro memcpy_avx_64_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 1
+%endm
+
+%macro memcpy_avx_64_1_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 1
+%endm
+
+%macro memcpy_avx_128_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 1
+%endm
+
+%macro memcpy_avx_128_1_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 1
+%endm
+
+
+%macro memcpy_avx_16 5
+ __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 1
+%endm
+
+%macro memcpy_avx_16_1 5
+ __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 1
+%endm
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%macro memcpy_avx2_64 7
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 0, 2
+%endm
+
+%macro memcpy_avx2_64_1 7
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 0, 2
+%endm
+
+%macro memcpy_avx2_128 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 0, 128, 0, 2
+%endm
+
+%macro memcpy_avx2_128_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 1, 128, 0, 2
+%endm
+
+%macro memcpy_avx2_64_ret 7
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 1, 2
+%endm
+
+%macro memcpy_avx2_64_1_ret 7
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 1, 2
+%endm
+
+%macro memcpy_avx2_128_ret 9
+	__memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 0, 128, 1, 2
+%endm
+
+%macro memcpy_avx2_128_1_ret 9
+	__memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 1, 128, 1, 2
+%endm
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+%macro __memcpy_int 13
+%define %%DST %1 ; register: pointer to dst (not modified)
+%define %%SRC %2 ; register: pointer to src (not modified)
+%define %%SIZE %3 ; register: length in bytes (not modified)
+%define %%TMP0 %4 ; 64-bit temp GPR (clobbered)
+%define %%TMP1 %5 ; 64-bit temp GPR (clobbered)
+%define %%XTMP0 %6 ; temp XMM (clobbered)
+%define %%XTMP1 %7 ; temp XMM (clobbered)
+%define %%XTMP2 %8 ; temp XMM (clobbered)
+%define %%XTMP3 %9 ; temp XMM (clobbered)
+%define %%NOT0 %10 ; if not 0, then assume size cannot be zero
+%define %%MAXSIZE %11 ; 128, 64, etc
+%define %%USERET %12 ; if not 0, use "ret" at end
+%define %%USEAVX %13 ; 0 = SSE, 1 = AVX1, 2 = AVX2
+
+%if (%%USERET != 0)
+ %define %%DONE ret
+%else
+ %define %%DONE jmp %%end
+%endif
+
+%if (%%USEAVX != 0)
+ %define %%MOVDQU vmovdqu
+%else
+ %define %%MOVDQU movdqu
+%endif
+
+%if (%%MAXSIZE >= 128)
+ test %%SIZE, 64
+ jz %%lt64
+ %if (%%USEAVX >= 2)
+ %%MOVDQU %%XTMP0, [%%SRC + 0*32]
+ %%MOVDQU %%XTMP1, [%%SRC + 1*32]
+ %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*32]
+ %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*32]
+
+ %%MOVDQU [%%DST + 0*32], %%XTMP0
+ %%MOVDQU [%%DST + 1*32], %%XTMP1
+ %%MOVDQU [%%DST + %%SIZE - 2*32], %%XTMP2
+ %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP3
+ %else
+ %%MOVDQU %%XTMP0, [%%SRC + 0*16]
+ %%MOVDQU %%XTMP1, [%%SRC + 1*16]
+ %%MOVDQU %%XTMP2, [%%SRC + 2*16]
+ %%MOVDQU %%XTMP3, [%%SRC + 3*16]
+ %%MOVDQU [%%DST + 0*16], %%XTMP0
+ %%MOVDQU [%%DST + 1*16], %%XTMP1
+ %%MOVDQU [%%DST + 2*16], %%XTMP2
+ %%MOVDQU [%%DST + 3*16], %%XTMP3
+
+ %%MOVDQU %%XTMP0, [%%SRC + %%SIZE - 4*16]
+ %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 3*16]
+ %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16]
+ %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16]
+ %%MOVDQU [%%DST + %%SIZE - 4*16], %%XTMP0
+ %%MOVDQU [%%DST + %%SIZE - 3*16], %%XTMP1
+ %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2
+ %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3
+ %endif
+ %%DONE
+%endif
+
+%if (%%MAXSIZE >= 64)
+%%lt64:
+ test %%SIZE, 32
+ jz %%lt32
+ %if (%%USEAVX >= 2)
+ %%MOVDQU %%XTMP0, [%%SRC + 0*32]
+ %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*32]
+ %%MOVDQU [%%DST + 0*32], %%XTMP0
+ %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP1
+ %else
+ %%MOVDQU %%XTMP0, [%%SRC + 0*16]
+ %%MOVDQU %%XTMP1, [%%SRC + 1*16]
+ %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16]
+ %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16]
+ %%MOVDQU [%%DST + 0*16], %%XTMP0
+ %%MOVDQU [%%DST + 1*16], %%XTMP1
+ %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2
+ %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3
+ %endif
+ %%DONE
+%endif
+
+%if (%%MAXSIZE >= 32)
+%%lt32:
+ test %%SIZE, 16
+ jz %%lt16
+ %if (%%USEAVX >= 2)
+ %%MOVDQU XWORD(%%XTMP0), [%%SRC + 0*16]
+ %%MOVDQU XWORD(%%XTMP1), [%%SRC + %%SIZE - 1*16]
+ %%MOVDQU [%%DST + 0*16], XWORD(%%XTMP0)
+ %%MOVDQU [%%DST + %%SIZE - 1*16], XWORD(%%XTMP1)
+ %else
+ %%MOVDQU %%XTMP0, [%%SRC + 0*16]
+ %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*16]
+ %%MOVDQU [%%DST + 0*16], %%XTMP0
+ %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP1
+ %endif
+ %%DONE
+%endif
+
+%if (%%MAXSIZE >= 16)
+%%lt16:
+ test %%SIZE, 8
+ jz %%lt8
+ mov %%TMP0, [%%SRC]
+ mov %%TMP1, [%%SRC + %%SIZE - 8]
+ mov [%%DST], %%TMP0
+ mov [%%DST + %%SIZE - 8], %%TMP1
+ %%DONE
+%endif
+
+%if (%%MAXSIZE >= 8)
+%%lt8:
+ test %%SIZE, 4
+ jz %%lt4
+ mov DWORD(%%TMP0), [%%SRC]
+ mov DWORD(%%TMP1), [%%SRC + %%SIZE - 4]
+ mov [%%DST], DWORD(%%TMP0)
+ mov [%%DST + %%SIZE - 4], DWORD(%%TMP1)
+ %%DONE
+%endif
+
+%if (%%MAXSIZE >= 4)
+%%lt4:
+ test %%SIZE, 2
+ jz %%lt2
+ movzx DWORD(%%TMP0), word [%%SRC]
+ movzx DWORD(%%TMP1), byte [%%SRC + %%SIZE - 1]
+ mov [%%DST], WORD(%%TMP0)
+ mov [%%DST + %%SIZE - 1], BYTE(%%TMP1)
+ %%DONE
+%endif
+
+%%lt2:
+%if (%%NOT0 == 0)
+ test %%SIZE, 1
+ jz %%end
+%endif
+ movzx DWORD(%%TMP0), byte [%%SRC]
+ mov [%%DST], BYTE(%%TMP0)
+%%end:
+%if (%%USERET != 0)
+ ret
+%endif
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Utility macro to assist with SIMD shifting
+%macro _PSRLDQ 3
+%define %%VEC %1
+%define %%REG %2
+%define %%IMM %3
+
+%ifidn %%VEC, SSE
+ psrldq %%REG, %%IMM
+%else
+ vpsrldq %%REG, %%REG, %%IMM
+%endif
+%endm
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; This section defines a series of macros to store small to medium amounts
+; of data from SIMD registers to memory, where the size is variable but limited.
+;
+; The macros are all called as:
+; simd_store DST, SRC, SIZE, TMP, IDX
+; with the parameters defined as:
+; DST : register: pointer to dst (not modified)
+; SRC : register: src data (clobbered)
+; SIZE : register: length in bytes (not modified)
+; TMP : 64-bit temp GPR (clobbered)
+; IDX : 64-bit GPR to store dst index/offset (clobbered)
+;
+; The name indicates the options. The name is of the form:
+; simd_store_<VEC>
+; where <VEC> is the SIMD instruction type e.g. "sse" or "avx"
+
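+; Illustrative invocation (a commented sketch; register choices are arbitrary):
+;
+;      ; rdi = dst, xmm0 = source data (clobbered), rdx = size (0..16 bytes)
+;      simd_store_sse rdi, xmm0, rdx, rax, r10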
+
+%macro simd_store_sse 5
+ __simd_store %1,%2,%3,%4,%5,SSE
+%endm
+
+%macro simd_store_avx 5
+ __simd_store %1,%2,%3,%4,%5,AVX
+%endm
+
+%macro simd_store_sse_15 5
+ __simd_store %1,%2,%3,%4,%5,SSE,15
+%endm
+
+%macro simd_store_avx_15 5
+ __simd_store %1,%2,%3,%4,%5,AVX,15
+%endm
+
+%macro __simd_store 6-7
+%define %%DST %1 ; register: pointer to dst (not modified)
+%define %%SRC %2 ; register: src data (clobbered)
+%define %%SIZE %3 ; register: length in bytes (not modified)
+%define %%TMP %4 ; 64-bit temp GPR (clobbered)
+%define %%IDX %5 ; 64-bit temp GPR to store dst idx (clobbered)
+%define %%SIMDTYPE %6 ; "SSE" or "AVX"
+%define %%MAX_LEN %7 ; [optional] maximum length to be stored, default 16
+
+%define %%PSRLDQ _PSRLDQ %%SIMDTYPE,
+
+%ifidn %%SIMDTYPE, SSE
+ %define %%MOVDQU movdqu
+ %define %%MOVQ movq
+%else
+ %define %%MOVDQU vmovdqu
+ %define %%MOVQ vmovq
+%endif
+
+;; determine max byte size for store operation
+%if %0 > 6
+%assign max_length_to_store %%MAX_LEN
+%else
+%assign max_length_to_store 16
+%endif
+
+%if max_length_to_store > 16
+%error "__simd_store macro invoked with MAX_LEN bigger than 16!"
+%endif
+
+ xor %%IDX, %%IDX ; zero idx
+
+%if max_length_to_store == 16
+ test %%SIZE, 16
+ jz %%lt16
+ %%MOVDQU [%%DST], %%SRC
+ jmp %%end
+%%lt16:
+%endif
+
+%if max_length_to_store >= 8
+ test %%SIZE, 8
+ jz %%lt8
+ %%MOVQ [%%DST + %%IDX], %%SRC
+ %%PSRLDQ %%SRC, 8
+ add %%IDX, 8
+%%lt8:
+%endif
+
+ %%MOVQ %%TMP, %%SRC ; use GPR from now on
+
+%if max_length_to_store >= 4
+ test %%SIZE, 4
+ jz %%lt4
+ mov [%%DST + %%IDX], DWORD(%%TMP)
+ shr %%TMP, 32
+ add %%IDX, 4
+%%lt4:
+%endif
+
+ test %%SIZE, 2
+ jz %%lt2
+ mov [%%DST + %%IDX], WORD(%%TMP)
+ shr %%TMP, 16
+ add %%IDX, 2
+%%lt2:
+ test %%SIZE, 1
+ jz %%end
+ mov [%%DST + %%IDX], BYTE(%%TMP)
+%%end:
+%endm
+
+; This section defines a series of macros to load small to medium amounts
+; (from 0 to 16 bytes) of data from memory to SIMD registers,
+; where the size is variable but limited.
+;
+; The macros are all called as:
+; simd_load DST, SRC, SIZE
+; with the parameters defined as:
+; DST : register: destination XMM register
+; SRC : register: pointer to src data (not modified)
+; SIZE : register: length in bytes (not modified)
+;
+; The name indicates the options. The name is of the form:
+; simd_load_<VEC>_<SZ><ZERO>
+; where:
+; <VEC> is either "sse" or "avx"
+; <SZ> is either "15" or "16" and defines largest value of SIZE
+; <ZERO> is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0)
+;
+; For example:
+; simd_load_sse_16 : SSE, 0 <= size <= 16
+; simd_load_avx_15_1 : AVX, 1 <= size <= 15
+
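+; Illustrative invocation (a commented sketch; register choices are arbitrary):
+;
+;      ; rsi = src, rdx = size with 0 <= size <= 16; the result lands in xmm0
+;      simd_load_sse_16 xmm0, rsi, rdx
+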
+%macro simd_load_sse_15_1 3
+ __simd_load %1,%2,%3,0,0,SSE
+%endm
+%macro simd_load_sse_15 3
+ __simd_load %1,%2,%3,1,0,SSE
+%endm
+%macro simd_load_sse_16_1 3
+ __simd_load %1,%2,%3,0,1,SSE
+%endm
+%macro simd_load_sse_16 3
+ __simd_load %1,%2,%3,1,1,SSE
+%endm
+
+%macro simd_load_avx_15_1 3
+ __simd_load %1,%2,%3,0,0,AVX
+%endm
+%macro simd_load_avx_15 3
+ __simd_load %1,%2,%3,1,0,AVX
+%endm
+%macro simd_load_avx_16_1 3
+ __simd_load %1,%2,%3,0,1,AVX
+%endm
+%macro simd_load_avx_16 3
+ __simd_load %1,%2,%3,1,1,AVX
+%endm
+
+%macro __simd_load 6
+%define %%DST %1 ; [out] destination XMM register
+%define %%SRC %2 ; [in] pointer to src data
+%define %%SIZE %3 ; [in] length in bytes (0-16 bytes)
+%define %%ACCEPT_0 %4 ; 0 = min length = 1, 1 = min length = 0
+%define %%ACCEPT_16 %5 ; 0 = max length = 15 , 1 = max length = 16
+%define %%SIMDTYPE %6 ; "SSE" or "AVX"
+
+%ifidn %%SIMDTYPE, SSE
+ %define %%MOVDQU movdqu
+ %define %%PINSRB pinsrb
+ %define %%PINSRQ pinsrq
+ %define %%PXOR pxor
+%else
+ %define %%MOVDQU vmovdqu
+ %define %%PINSRB vpinsrb
+ %define %%PINSRQ vpinsrq
+ %define %%PXOR vpxor
+%endif
+
+%if (%%ACCEPT_16 != 0)
+ test %%SIZE, 16
+ jz %%_skip_16
+ %%MOVDQU %%DST, [%%SRC]
+ jmp %%end_load
+
+%%_skip_16:
+%endif
+ %%PXOR %%DST, %%DST ; clear XMM register
+%if (%%ACCEPT_0 != 0)
+ or %%SIZE, %%SIZE
+ je %%end_load
+%endif
+ cmp %%SIZE, 1
+ je %%_size_1
+ cmp %%SIZE, 2
+ je %%_size_2
+ cmp %%SIZE, 3
+ je %%_size_3
+ cmp %%SIZE, 4
+ je %%_size_4
+ cmp %%SIZE, 5
+ je %%_size_5
+ cmp %%SIZE, 6
+ je %%_size_6
+ cmp %%SIZE, 7
+ je %%_size_7
+ cmp %%SIZE, 8
+ je %%_size_8
+ cmp %%SIZE, 9
+ je %%_size_9
+ cmp %%SIZE, 10
+ je %%_size_10
+ cmp %%SIZE, 11
+ je %%_size_11
+ cmp %%SIZE, 12
+ je %%_size_12
+ cmp %%SIZE, 13
+ je %%_size_13
+ cmp %%SIZE, 14
+ je %%_size_14
+
+%%_size_15:
+ %%PINSRB %%DST, [%%SRC + 14], 14
+%%_size_14:
+ %%PINSRB %%DST, [%%SRC + 13], 13
+%%_size_13:
+ %%PINSRB %%DST, [%%SRC + 12], 12
+%%_size_12:
+ %%PINSRB %%DST, [%%SRC + 11], 11
+%%_size_11:
+ %%PINSRB %%DST, [%%SRC + 10], 10
+%%_size_10:
+ %%PINSRB %%DST, [%%SRC + 9], 9
+%%_size_9:
+ %%PINSRB %%DST, [%%SRC + 8], 8
+%%_size_8:
+ %%PINSRQ %%DST, [%%SRC], 0
+ jmp %%end_load
+%%_size_7:
+ %%PINSRB %%DST, [%%SRC + 6], 6
+%%_size_6:
+ %%PINSRB %%DST, [%%SRC + 5], 5
+%%_size_5:
+ %%PINSRB %%DST, [%%SRC + 4], 4
+%%_size_4:
+ %%PINSRB %%DST, [%%SRC + 3], 3
+%%_size_3:
+ %%PINSRB %%DST, [%%SRC + 2], 2
+%%_size_2:
+ %%PINSRB %%DST, [%%SRC + 1], 1
+%%_size_1:
+ %%PINSRB %%DST, [%%SRC + 0], 0
+%%end_load:
+%endm
+
+%endif ; ifndef __MEMCPY_ASM__
diff --git a/src/crypto/isa-l/isa-l_crypto/include/memcpy_inline.h b/src/crypto/isa-l/isa-l_crypto/include/memcpy_inline.h
new file mode 100644
index 000000000..e0cc314d1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/memcpy_inline.h
@@ -0,0 +1,375 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+/**
+ * @file memcpy_inline.h
+ * @brief Defines intrinsic memcpy functions used by the new hashing API
+ *
+ */
+
+#ifndef _MEMCPY_H_
+#define _MEMCPY_H_
+
+#if defined(__i386__) || defined(__x86_64__) || defined( _M_X64) \
+ || defined(_M_IX86)
+#include "intrinreg.h"
+#endif
+#include <string.h>
+#include <assert.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__i386__) || defined(__x86_64__) || defined( _M_X64) \
+ || defined(_M_IX86)
+
+#define memcpy_varlen memcpy_sse_varlen
+#define memcpy_fixedlen memcpy_sse_fixedlen
+
+#define memclr_varlen memclr_sse_varlen
+#define memclr_fixedlen memclr_sse_fixedlen
+
+static inline void memcpy_lte32_sse_fixedlen(void* dst, const void* src, size_t nbytes);
+static inline void memcpy_gte16_sse_fixedlen(void* dst, const void* src, size_t nbytes);
+static inline void memcpy_sse_fixedlen (void* dst, const void* src, size_t nbytes);
+
+static inline void memcpy_lte32_sse_varlen (void* dst, const void* src, size_t nbytes);
+static inline void memcpy_gte16_sse_varlen (void* dst, const void* src, size_t nbytes);
+static inline void memcpy_sse_varlen (void* dst, const void* src, size_t nbytes);
+
+
+static inline void memclr_lte32_sse_fixedlen(void* dst, size_t nbytes);
+static inline void memclr_gte16_sse_fixedlen(void* dst, size_t nbytes);
+static inline void memclr_sse_fixedlen (void* dst, size_t nbytes);
+
+static inline void memclr_lte32_sse_varlen (void* dst, size_t nbytes);
+static inline void memclr_gte16_sse_varlen (void* dst, size_t nbytes);
+static inline void memclr_sse_varlen (void* dst, size_t nbytes);
+
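+/*
+ * Illustrative usage sketch ("dst", "src" and "n" are hypothetical): these
+ * helpers stand in for the libc calls when copying or clearing small regions.
+ *
+ * \code
+ * memcpy_varlen(dst, src, n);      // length known only at run time
+ * memcpy_fixedlen(dst, src, 64);   // length is a compile-time constant
+ * memclr_varlen(dst, n);           // equivalent to memset(dst, 0, n)
+ * \endcode
+ */
+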
+#define MEMCPY_BETWEEN_N_AND_2N_BYTES(N, fixedwidth, dst, src, nbytes) \
+ do { \
+ intrinreg##N head; \
+ intrinreg##N tail; \
+ assert(N <= nbytes && nbytes <= 2*N); \
+ if(N == 1 || (fixedwidth && nbytes==N) ) { \
+ head = load_intrinreg##N(src); \
+ store_intrinreg##N(dst, head); \
+ } \
+ else { \
+ head = load_intrinreg##N(src); \
+ tail = load_intrinreg##N((const void*)((const char*)src + (nbytes - N))); \
+ store_intrinreg##N(dst, head); \
+ store_intrinreg##N((void*)((char*)dst + (nbytes - N)), tail); \
+ } \
+ } while(0)
+
+#define MEMCLR_BETWEEN_N_AND_2N_BYTES(N, fixedwidth, dst, nbytes) \
+ do { \
+ const intrinreg##N zero = {0}; \
+ assert(N <= nbytes && nbytes <= 2*N); \
+ if(N == 1 || (fixedwidth && nbytes==N) ) { \
+ store_intrinreg##N(dst, zero); \
+ } \
+ else { \
+ store_intrinreg##N(dst, zero); \
+ store_intrinreg##N((void*)((char*)dst + (nbytes - N)), zero); \
+ } \
+ } while(0)
+
+// Define load/store functions uniformly.
+
+#define load_intrinreg16(src) _mm_loadu_ps((const float*) src)
+#define store_intrinreg16(dst,val) _mm_storeu_ps((float*) dst, val)
+
+static inline intrinreg8 load_intrinreg8(const void *src)
+{
+ return *(intrinreg8 *) src;
+}
+
+static inline void store_intrinreg8(void *dst, intrinreg8 val)
+{
+ *(intrinreg8 *) dst = val;
+}
+
+static inline intrinreg4 load_intrinreg4(const void *src)
+{
+ return *(intrinreg4 *) src;
+}
+
+static inline void store_intrinreg4(void *dst, intrinreg4 val)
+{
+ *(intrinreg4 *) dst = val;
+}
+
+static inline intrinreg2 load_intrinreg2(const void *src)
+{
+ return *(intrinreg2 *) src;
+}
+
+static inline void store_intrinreg2(void *dst, intrinreg2 val)
+{
+ *(intrinreg2 *) dst = val;
+}
+
+static inline intrinreg1 load_intrinreg1(const void *src)
+{
+ return *(intrinreg1 *) src;
+}
+
+static inline void store_intrinreg1(void *dst, intrinreg1 val)
+{
+ *(intrinreg1 *) dst = val;
+}
+
+static inline void memcpy_gte16_sse_fixedlen(void *dst, const void *src, size_t nbytes)
+{
+ size_t i;
+ size_t j;
+ intrinreg16 pool[4];
+ size_t remaining_moves;
+ size_t tail_offset;
+ int do_tail;
+ assert(nbytes >= 16);
+
+ for (i = 0; i + 16 * 4 <= nbytes; i += 16 * 4) {
+ for (j = 0; j < 4; j++)
+ pool[j] =
+ load_intrinreg16((const void *)((const char *)src + i + 16 * j));
+ for (j = 0; j < 4; j++)
+ store_intrinreg16((void *)((char *)dst + i + 16 * j), pool[j]);
+ }
+
+ remaining_moves = (nbytes - i) / 16;
+ tail_offset = nbytes - 16;
+ do_tail = (tail_offset & (16 - 1));
+
+ for (j = 0; j < remaining_moves; j++)
+ pool[j] = load_intrinreg16((const void *)((const char *)src + i + 16 * j));
+
+ if (do_tail)
+ pool[j] = load_intrinreg16((const void *)((const char *)src + tail_offset));
+
+ for (j = 0; j < remaining_moves; j++)
+ store_intrinreg16((void *)((char *)dst + i + 16 * j), pool[j]);
+
+ if (do_tail)
+ store_intrinreg16((void *)((char *)dst + tail_offset), pool[j]);
+}
+
+static inline void memclr_gte16_sse_fixedlen(void *dst, size_t nbytes)
+{
+ size_t i;
+ size_t j;
+ const intrinreg16 zero = { 0 };
+ size_t remaining_moves;
+ size_t tail_offset;
+ int do_tail;
+ assert(nbytes >= 16);
+
+ for (i = 0; i + 16 * 4 <= nbytes; i += 16 * 4)
+ for (j = 0; j < 4; j++)
+ store_intrinreg16((void *)((char *)dst + i + 16 * j), zero);
+
+ remaining_moves = (nbytes - i) / 16;
+ tail_offset = nbytes - 16;
+ do_tail = (tail_offset & (16 - 1));
+
+ for (j = 0; j < remaining_moves; j++)
+ store_intrinreg16((void *)((char *)dst + i + 16 * j), zero);
+
+ if (do_tail)
+ store_intrinreg16((void *)((char *)dst + tail_offset), zero);
+}
+
+static inline void memcpy_lte32_sse_fixedlen(void *dst, const void *src, size_t nbytes)
+{
+ assert(nbytes <= 32);
+ if (nbytes >= 16)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(16, 1, dst, src, nbytes);
+ else if (nbytes >= 8)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(8, 1, dst, src, nbytes);
+ else if (nbytes >= 4)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(4, 1, dst, src, nbytes);
+ else if (nbytes >= 2)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(2, 1, dst, src, nbytes);
+ else if (nbytes >= 1)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(1, 1, dst, src, nbytes);
+}
+
+static inline void memclr_lte32_sse_fixedlen(void *dst, size_t nbytes)
+{
+ assert(nbytes <= 32);
+ if (nbytes >= 16)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(16, 1, dst, nbytes);
+ else if (nbytes >= 8)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(8, 1, dst, nbytes);
+ else if (nbytes >= 4)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(4, 1, dst, nbytes);
+ else if (nbytes >= 2)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(2, 1, dst, nbytes);
+ else if (nbytes >= 1)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(1, 1, dst, nbytes);
+}
+
+static inline void memcpy_lte32_sse_varlen(void *dst, const void *src, size_t nbytes)
+{
+ assert(nbytes <= 32);
+ if (nbytes >= 16)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(16, 0, dst, src, nbytes);
+ else if (nbytes >= 8)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(8, 0, dst, src, nbytes);
+ else if (nbytes >= 4)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(4, 0, dst, src, nbytes);
+ else if (nbytes >= 2)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(2, 0, dst, src, nbytes);
+ else if (nbytes >= 1)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(1, 0, dst, src, nbytes);
+}
+
+static inline void memclr_lte32_sse_varlen(void *dst, size_t nbytes)
+{
+ assert(nbytes <= 32);
+ if (nbytes >= 16)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(16, 0, dst, nbytes);
+ else if (nbytes >= 8)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(8, 0, dst, nbytes);
+ else if (nbytes >= 4)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(4, 0, dst, nbytes);
+ else if (nbytes >= 2)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(2, 0, dst, nbytes);
+ else if (nbytes >= 1)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(1, 0, dst, nbytes);
+}
+
+static inline void memcpy_gte16_sse_varlen(void *dst, const void *src, size_t nbytes)
+{
+ size_t i = 0;
+ intrinreg16 tail;
+
+ assert(nbytes >= 16);
+
+ while (i + 128 <= nbytes) {
+ memcpy_gte16_sse_fixedlen((void *)((char *)dst + i),
+ (const void *)((const char *)src + i), 128);
+ i += 128;
+ }
+ if (i + 64 <= nbytes) {
+ memcpy_gte16_sse_fixedlen((void *)((char *)dst + i),
+ (const void *)((const char *)src + i), 64);
+ i += 64;
+ }
+ if (i + 32 <= nbytes) {
+ memcpy_gte16_sse_fixedlen((void *)((char *)dst + i),
+ (const void *)((const char *)src + i), 32);
+ i += 32;
+ }
+ if (i + 16 <= nbytes) {
+ memcpy_gte16_sse_fixedlen((void *)((char *)dst + i),
+ (const void *)((const char *)src + i), 16);
+ }
+
+ i = nbytes - 16;
+ tail = load_intrinreg16((const void *)((const char *)src + i));
+ store_intrinreg16((void *)((char *)dst + i), tail);
+}
+
+static inline void memclr_gte16_sse_varlen(void *dst, size_t nbytes)
+{
+ size_t i = 0;
+ const intrinreg16 zero = { 0 };
+
+ assert(nbytes >= 16);
+
+ while (i + 128 <= nbytes) {
+ memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 128);
+ i += 128;
+ }
+ if (i + 64 <= nbytes) {
+ memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 64);
+ i += 64;
+ }
+ if (i + 32 <= nbytes) {
+ memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 32);
+ i += 32;
+ }
+ if (i + 16 <= nbytes) {
+ memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 16);
+ }
+
+ i = nbytes - 16;
+ store_intrinreg16((void *)((char *)dst + i), zero);
+}
+
+static inline void memcpy_sse_fixedlen(void *dst, const void *src, size_t nbytes)
+{
+ if (nbytes >= 16)
+ memcpy_gte16_sse_fixedlen(dst, src, nbytes);
+ else
+ memcpy_lte32_sse_fixedlen(dst, src, nbytes);
+}
+
+static inline void memclr_sse_fixedlen(void *dst, size_t nbytes)
+{
+ if (nbytes >= 16)
+ memclr_gte16_sse_fixedlen(dst, nbytes);
+ else
+ memclr_lte32_sse_fixedlen(dst, nbytes);
+}
+
+static inline void memcpy_sse_varlen(void *dst, const void *src, size_t nbytes)
+{
+ if (nbytes >= 16)
+ memcpy_gte16_sse_varlen(dst, src, nbytes);
+ else
+ memcpy_lte32_sse_varlen(dst, src, nbytes);
+}
+
+static inline void memclr_sse_varlen(void *dst, size_t nbytes)
+{
+ if (nbytes >= 16)
+ memclr_gte16_sse_varlen(dst, nbytes);
+ else
+ memclr_lte32_sse_varlen(dst, nbytes);
+}
+#else
+#define memcpy_varlen memcpy
+#define memcpy_fixedlen memcpy
+
+#define memclr_varlen(dst,n) memset(dst,0,n)
+#define memclr_fixedlen(dst,n) memset(dst,0,n)
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _MEMCPY_H_
diff --git a/src/crypto/isa-l/isa-l_crypto/include/mh_sha1.h b/src/crypto/isa-l/isa-l_crypto/include/mh_sha1.h
new file mode 100644
index 000000000..eac3be031
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/mh_sha1.h
@@ -0,0 +1,315 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MH_SHA1_H_
+#define _MH_SHA1_H_
+
+/**
+ * @file mh_sha1.h
+ * @brief mh_sha1 function prototypes and structures
+ *
+ * Interface for mh_sha1 functions
+ *
+ * <b> mh_sha1 Init-Update..Update-Finalize </b>
+ *
+ * This file defines the interface to optimized functions used in mh_sha1.
+ * The definition of multi-hash SHA1(mh_sha1, for short) is: Pad the buffer
+ * in SHA1 style until the total length is a multiple of 4*16*16
+ * (words-width * parallel-segments * block-size); Hash the buffer in
+ * parallel, generating digests of 4*16*5 (words-width*parallel-segments*
+ * digest-size); Treat the set of digests as another data buffer, and
+ * generate a final SHA1 digest for it.
+ *
+ *
+ * Example
+ * \code
+ * uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS];
+ * struct mh_sha1_ctx *ctx;
+ *
+ * ctx = malloc(sizeof(struct mh_sha1_ctx));
+ * mh_sha1_init(ctx);
+ * mh_sha1_update(ctx, buff, block_len);
+ * mh_sha1_finalize(ctx, mh_sha1_digest);
+ * \endcode
+ */
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+// External Interface Definition
+#define HASH_SEGS 16
+#define SHA1_BLOCK_SIZE 64
+#define MH_SHA1_BLOCK_SIZE (HASH_SEGS * SHA1_BLOCK_SIZE)
+#define SHA1_DIGEST_WORDS 5
+#define AVX512_ALIGNED 64
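+
+/* Derived sizes implied by the constants above: one multi-hash block is
+ * MH_SHA1_BLOCK_SIZE = 16 * 64 = 1024 bytes, and the interim digest area in
+ * struct mh_sha1_ctx below is sizeof(uint32_t) * SHA1_DIGEST_WORDS * HASH_SEGS
+ * = 4 * 5 * 16 = 320 bytes. */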
+
+/** @brief Holds info describing a single mh_sha1
+ *
+ * It is better to allocate this data structure on the heap to avoid stack overflow.
+ *
+*/
+struct mh_sha1_ctx {
+ uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]; //!< the digest of multi-hash SHA1
+
+ uint64_t total_length;
+	//!< Total length in bytes of the input data processed so far by the update feature
+ uint8_t partial_block_buffer [MH_SHA1_BLOCK_SIZE * 2];
+ //!< Padding the tail of input data for SHA1
+ uint8_t mh_sha1_interim_digests[sizeof(uint32_t) * SHA1_DIGEST_WORDS * HASH_SEGS];
+	//!< Stores the interim SHA1 digests of all 16 segments; copied to the stack when used so it can be 64-byte aligned.
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE + AVX512_ALIGNED];
+	//!< Scratch buffer for restructuring SHA1 block data from the segments into big-endian form; AVX512_ALIGNED leaves room for 64-byte alignment.
+};
+
+/**
+ * @enum mh_sha1_ctx_error
+ * @brief CTX error flags
+ */
+enum mh_sha1_ctx_error{
+ MH_SHA1_CTX_ERROR_NONE = 0, //!< MH_SHA1_CTX_ERROR_NONE
+ MH_SHA1_CTX_ERROR_NULL = -1, //!< MH_SHA1_CTX_ERROR_NULL
+};
+
+
+/*******************************************************************
+ * mh_sha1 API function prototypes
+ ******************************************************************/
+
+/**
+ * @brief Initialize the mh_sha1_ctx structure.
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha1_init (struct mh_sha1_ctx* ctx);
+
+/**
+ * @brief Multi-hash sha1 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha1_update (struct mh_sha1_ctx * ctx, const void* buffer, uint32_t len);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha1.
+ *
+ * Place the message digest in mh_sha1_digest which must have enough space
+ * for the outputs.
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha1_finalize (struct mh_sha1_ctx* ctx, void* mh_sha1_digest);
+
+/*******************************************************************
+ * multi-types of mh_sha1 internal API
+ *
+ * XXXX        The multi-binary version
+ * XXXX_base   The C reference version, used to illustrate the algorithm
+ * XXXX_sse    The version that uses an ASM function optimized for SSE
+ * XXXX_avx    The version that uses an ASM function optimized for AVX
+ * XXXX_avx2   The version that uses an ASM function optimized for AVX2
+ * XXXX_avx512 The version that uses an ASM function optimized for AVX512
+ *
+ ******************************************************************/
+
+/**
+ * @brief Multi-hash sha1 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * Base update() function that does not require SIMD support.
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_update_base (struct mh_sha1_ctx* ctx, const void* buffer, uint32_t len);
+
+/**
+ * @brief Multi-hash sha1 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires SSE
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_update_sse (struct mh_sha1_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Multi-hash sha1 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_update_avx (struct mh_sha1_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Multi-hash sha1 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX2
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_update_avx2 (struct mh_sha1_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Multi-hash sha1 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX512
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_update_avx512 (struct mh_sha1_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+
+/**
+ * @brief Finalize the message digests for multi-hash sha1.
+ *
+ * Place the message digests in mh_sha1_digest,
+ * which must have enough space for the outputs.
+ * Base Finalize() function that does not require SIMD support.
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_finalize_base (struct mh_sha1_ctx* ctx,
+ void* mh_sha1_digest);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha1.
+ *
+ * Place the message digest in mh_sha1_digest which must have enough space
+ * for the outputs.
+ *
+ * @requires SSE
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_finalize_sse (struct mh_sha1_ctx* ctx,
+ void* mh_sha1_digest);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha1.
+ *
+ * Place the message digest in mh_sha1_digest which must have enough space
+ * for the outputs.
+ *
+ * @requires AVX
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_finalize_avx (struct mh_sha1_ctx* ctx,
+ void* mh_sha1_digest);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha1.
+ *
+ * Place the message digest in mh_sha1_digest which must have enough space
+ * for the outputs.
+ *
+ * @requires AVX2
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_finalize_avx2 (struct mh_sha1_ctx* ctx,
+ void* mh_sha1_digest);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha1.
+ *
+ * Place the message digest in mh_sha1_digest which must have enough space
+ * for the outputs.
+ *
+ * @requires AVX512
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_finalize_avx512 (struct mh_sha1_ctx* ctx,
+ void* mh_sha1_digest);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/src/crypto/isa-l/isa-l_crypto/include/mh_sha1_murmur3_x64_128.h b/src/crypto/isa-l/isa-l_crypto/include/mh_sha1_murmur3_x64_128.h
new file mode 100644
index 000000000..1c07306ec
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/mh_sha1_murmur3_x64_128.h
@@ -0,0 +1,327 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MH_SHA1_MURMUR3_X64_128_H_
+#define _MH_SHA1_MURMUR3_X64_128_H_
+
+/**
+ * @file mh_sha1_murmur3_x64_128.h
+ * @brief mh_sha1_murmur3_x64_128 function prototypes and structures
+ *
+ * Interface for mh_sha1_murmur3_x64_128 functions
+ *
+ * <b> mh_sha1_murmur3_x64_128 Init-Update..Update-Finalize </b>
+ *
+ * This file defines the interface to optimized functions used in mh_sha1 and
+ * mh_sha1_murmur3_x64_128. The definition of multi-hash SHA1 (mh_sha1,
+ * for short) is: Pad the buffer in SHA1 style until the total length is a multiple
+ * of 4*16*16 (words-width * parallel-segments * block-size); Hash the buffer
+ * in parallel, generating digests of 4*16*5 (words-width*parallel-segments*
+ * digest-size); Treat the set of digests as another data buffer, and generate
+ * a final SHA1 digest for it. mh_sha1_murmur3_x64_128 is a stitching function
+ * that computes a murmur3_x64_128 digest while generating the mh_sha1 digest.
+ *
+ *
+ * Example
+ * \code
+ * uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS];
+ * uint32_t murmur_digest[MURMUR3_x64_128_DIGEST_WORDS];
+ * struct mh_sha1_murmur3_x64_128_ctx *ctx;
+ *
+ * ctx = malloc(sizeof(struct mh_sha1_murmur3_x64_128_ctx));
+ * mh_sha1_murmur3_x64_128_init(ctx, 0);
+ * mh_sha1_murmur3_x64_128_update(ctx, buff, block_len);
+ * mh_sha1_murmur3_x64_128_finalize(ctx, mh_sha1_digest,
+ * murmur_digest);
+ * \endcode
+ */
+
+#include <stdint.h>
+#include "mh_sha1.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+// External Interface Definition
+// Add murmur3_x64_128 definition
+#define MUR_BLOCK_SIZE (2 * sizeof(uint64_t))
+#define MURMUR3_x64_128_DIGEST_WORDS 4
+
+/** @brief Holds info describing a single mh_sha1_murmur3_x64_128
+ *
+ * It is better to allocate this data structure on the heap to avoid stack overflow.
+ *
+*/
+struct mh_sha1_murmur3_x64_128_ctx {
+ uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]; //!< the digest of multi-hash SHA1
+ uint32_t murmur3_x64_128_digest[MURMUR3_x64_128_DIGEST_WORDS]; //!< the digest of murmur3_x64_128
+
+ uint64_t total_length;
+	//!< Total length in bytes of the input data processed so far (bookkeeping for the update flow)
+ uint8_t partial_block_buffer [MH_SHA1_BLOCK_SIZE * 2];
+	//!< Holds the unprocessed tail of the input data and its SHA1-style padding
+ uint8_t mh_sha1_interim_digests[sizeof(uint32_t) * SHA1_DIGEST_WORDS * HASH_SEGS];
+	//!< Stores the SHA1 interim digests of all 16 segments; each use copies it to the stack to obtain 64-byte alignment
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE + AVX512_ALIGNED];
+	//!< Restructures SHA1 block data from the different segments into big-endian form; sized with AVX512_ALIGNED for 64-byte alignment
+};
+
+/**
+ * @enum mh_sha1_murmur3_ctx_error
+ * @brief CTX error flags
+ */
+enum mh_sha1_murmur3_ctx_error{
+ MH_SHA1_MURMUR3_CTX_ERROR_NONE = 0, //!< MH_SHA1_MURMUR3_CTX_ERROR_NONE
+	MH_SHA1_MURMUR3_CTX_ERROR_NULL = -1, //!< MH_SHA1_MURMUR3_CTX_ERROR_NULL
+};
+
+
+/*******************************************************************
+ * mh_sha1_murmur3_x64_128 API function prototypes
+ ******************************************************************/
+
+/**
+ * @brief Initialize the mh_sha1_murmur3_x64_128_ctx structure.
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param murmur_seed Seed as an initial digest of murmur3
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha1_murmur3_x64_128_init (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ uint64_t murmur_seed);
+
+/**
+ * @brief Combined multi-hash and murmur hash update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha1_murmur3_x64_128_update (struct mh_sha1_murmur3_x64_128_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Finalize the message digests for combined multi-hash and murmur.
+ *
+ * Place the message digests in mh_sha1_digest and murmur3_x64_128_digest,
+ * which must have enough space for the outputs.
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @param murmur3_x64_128_digest The digest of murmur3_x64_128
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha1_murmur3_x64_128_finalize (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ void* mh_sha1_digest, void* murmur3_x64_128_digest);
+
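+/*
+ * Illustrative streaming sketch (not part of the upstream interface): the
+ * Init-Update..Update-Finalize flow above applied to chunked input, with the
+ * return codes checked. The chunk size, the read_chunk() helper and the
+ * omission of a malloc() NULL check are assumptions made only for this
+ * example.
+ *
+ * \code
+ * struct mh_sha1_murmur3_x64_128_ctx *ctx = malloc(sizeof(*ctx));
+ * uint32_t sha1_digest[SHA1_DIGEST_WORDS];
+ * uint32_t murmur_digest[MURMUR3_x64_128_DIGEST_WORDS];
+ * uint8_t chunk[4096];
+ * int n, ret;
+ *
+ * ret = mh_sha1_murmur3_x64_128_init(ctx, 0);
+ * while (ret == 0 && (n = read_chunk(chunk, sizeof(chunk))) > 0)
+ *	ret = mh_sha1_murmur3_x64_128_update(ctx, chunk, n);
+ * if (ret == 0)
+ *	ret = mh_sha1_murmur3_x64_128_finalize(ctx, sha1_digest, murmur_digest);
+ * free(ctx);
+ * \endcode
+ */
+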
+/*******************************************************************
+ * multi-types of mh_sha1_murmur3_x64_128 internal API
+ *
+ * XXXX The multi-binary version
+ * XXXX_base	The C code version, used to illustrate the algorithm
+ * XXXX_sse	The version that uses an ASM function optimized for SSE
+ * XXXX_avx	The version that uses an ASM function optimized for AVX
+ * XXXX_avx2	The version that uses an ASM function optimized for AVX2
+ * XXXX_avx512	The version that uses an ASM function optimized for AVX512
+ *
+ ******************************************************************/
+
+/**
+ * @brief Combined multi-hash and murmur hash update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * Base update() function that does not require SIMD support.
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_update_base (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Combined multi-hash and murmur hash update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires SSE
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_update_sse (struct mh_sha1_murmur3_x64_128_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Combined multi-hash and murmur hash update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_update_avx (struct mh_sha1_murmur3_x64_128_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Combined multi-hash and murmur hash update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX2
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_update_avx2 (struct mh_sha1_murmur3_x64_128_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Combined multi-hash and murmur hash update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX512
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_update_avx512 (struct mh_sha1_murmur3_x64_128_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Finalize the message digests for combined multi-hash and murmur.
+ *
+ * Place the message digests in mh_sha1_digest and murmur3_x64_128_digest,
+ * which must have enough space for the outputs.
+ * Base Finalize() function that does not require SIMD support.
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @param murmur3_x64_128_digest The digest of murmur3_x64_128
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_finalize_base (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ void* mh_sha1_digest, void* murmur3_x64_128_digest);
+
+/**
+ * @brief Finalize the message digests for combined multi-hash and murmur.
+ *
+ * Place the message digests in mh_sha1_digest and murmur3_x64_128_digest,
+ * which must have enough space for the outputs.
+ *
+ * @requires SSE
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @param murmur3_x64_128_digest The digest of murmur3_x64_128
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_finalize_sse (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ void* mh_sha1_digest, void* murmur3_x64_128_digest);
+
+/**
+ * @brief Finalize the message digests for combined multi-hash and murmur.
+ *
+ * Place the message digests in mh_sha1_digest and murmur3_x64_128_digest,
+ * which must have enough space for the outputs.
+ *
+ * @requires AVX
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @param murmur3_x64_128_digest The digest of murmur3_x64_128
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_finalize_avx (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ void* mh_sha1_digest, void* murmur3_x64_128_digest);
+
+/**
+ * @brief Finalize the message digests for combined multi-hash and murmur.
+ *
+ * Place the message digests in mh_sha1_digest and murmur3_x64_128_digest,
+ * which must have enough space for the outputs.
+ *
+ * @requires AVX2
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @param murmur3_x64_128_digest The digest of murmur3_x64_128
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_finalize_avx2 (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ void* mh_sha1_digest, void* murmur3_x64_128_digest);
+
+/**
+ * @brief Finalize the message digests for combined multi-hash and murmur.
+ *
+ * Place the message digests in mh_sha1_digest and murmur3_x64_128_digest,
+ * which must have enough space for the outputs.
+ *
+ * @requires AVX512
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @param murmur3_x64_128_digest The digest of murmur3_x64_128
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_finalize_avx512 (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ void* mh_sha1_digest, void* murmur3_x64_128_digest);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/src/crypto/isa-l/isa-l_crypto/include/mh_sha256.h b/src/crypto/isa-l/isa-l_crypto/include/mh_sha256.h
new file mode 100644
index 000000000..4a453833e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/mh_sha256.h
@@ -0,0 +1,315 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MH_SHA256_H_
+#define _MH_SHA256_H_
+
+/**
+ * @file mh_sha256.h
+ * @brief mh_sha256 function prototypes and structures
+ *
+ * Interface for mh_sha256 functions
+ *
+ * <b> mh_sha256 Init-Update..Update-Finalize </b>
+ *
+ * This file defines the interface to optimized functions used in mh_sha256.
+ * The definition of multi-hash SHA256 (mh_sha256, for short) is: Pad the buffer
+ * in SHA256 style until the total length is a multiple of 4*16*16
+ * (words-width * parallel-segments * block-size); Hash the buffer in
+ * parallel, generating digests of 4*16*8 (words-width*parallel-segments*
+ * digest-size); Treat the set of digests as another data buffer, and
+ * generate a final SHA256 digest for it.
+ *
+ *
+ * Example
+ * \code
+ * uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS];
+ * struct mh_sha256_ctx *ctx;
+ *
+ * ctx = malloc(sizeof(struct mh_sha256_ctx));
+ * mh_sha256_init(ctx);
+ * mh_sha256_update(ctx, buff, block_len);
+ * mh_sha256_finalize(ctx, mh_sha256_digest);
+ * \endcode
+ */
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+// External Interface Definition
+#define HASH_SEGS 16
+#define SHA256_BLOCK_SIZE 64
+#define MH_SHA256_BLOCK_SIZE (HASH_SEGS * SHA256_BLOCK_SIZE)
+#define SHA256_DIGEST_WORDS 8
+#define AVX512_ALIGNED 64
+
+/** @brief Holds info describing a single mh_sha256
+ *
+ * It is better to allocate this data structure on the heap to avoid stack overflow.
+ *
+*/
+struct mh_sha256_ctx {
+ uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]; //!< the digest of multi-hash SHA256
+
+ uint64_t total_length;
+	//!< Total length in bytes of the input data processed so far (bookkeeping for the update flow)
+ uint8_t partial_block_buffer [MH_SHA256_BLOCK_SIZE * 2];
+	//!< Holds the unprocessed tail of the input data and its SHA256-style padding
+ uint8_t mh_sha256_interim_digests[sizeof(uint32_t) * SHA256_DIGEST_WORDS * HASH_SEGS];
+	//!< Stores the SHA256 interim digests of all 16 segments; each use copies it to the stack to obtain 64-byte alignment
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE + AVX512_ALIGNED];
+	//!< Restructures SHA256 block data from the different segments into big-endian form; sized with AVX512_ALIGNED for 64-byte alignment
+};
+
+/**
+ * @enum mh_sha256_ctx_error
+ * @brief CTX error flags
+ */
+enum mh_sha256_ctx_error{
+ MH_SHA256_CTX_ERROR_NONE = 0, //!< MH_SHA256_CTX_ERROR_NONE
+ MH_SHA256_CTX_ERROR_NULL = -1, //!< MH_SHA256_CTX_ERROR_NULL
+};
+
+
+/*******************************************************************
+ * mh_sha256 API function prototypes
+ ******************************************************************/
+
+/**
+ * @brief Initialize the mh_sha256_ctx structure.
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha256_init (struct mh_sha256_ctx* ctx);
+
+/**
+ * @brief Multi-hash sha256 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha256_update (struct mh_sha256_ctx * ctx, const void* buffer, uint32_t len);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha256.
+ *
+ * Place the message digest in mh_sha256_digest which must have enough space
+ * for the outputs.
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param mh_sha256_digest The digest of mh_sha256
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha256_finalize (struct mh_sha256_ctx* ctx, void* mh_sha256_digest);
+
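+/*
+ * Illustrative one-shot helper (a sketch, not part of the upstream API):
+ * wraps the Init-Update-Finalize sequence above for a single in-memory
+ * buffer, allocating the context on the heap as recommended. The helper
+ * name mh_sha256_oneshot is an assumption made only for this example.
+ *
+ * \code
+ * static int mh_sha256_oneshot(const void *buf, uint32_t len,
+ *			       uint32_t digest[SHA256_DIGEST_WORDS])
+ * {
+ *	struct mh_sha256_ctx *ctx = malloc(sizeof(*ctx));
+ *	int ret;
+ *
+ *	if (ctx == NULL)
+ *		return MH_SHA256_CTX_ERROR_NULL;
+ *	ret = mh_sha256_init(ctx);
+ *	if (ret == 0)
+ *		ret = mh_sha256_update(ctx, buf, len);
+ *	if (ret == 0)
+ *		ret = mh_sha256_finalize(ctx, digest);
+ *	free(ctx);
+ *	return ret;
+ * }
+ * \endcode
+ */
+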
+/*******************************************************************
+ * multi-types of mh_sha256 internal API
+ *
+ * XXXX The multi-binary version
+ * XXXX_base	The C code version, used to illustrate the algorithm
+ * XXXX_sse	The version that uses an ASM function optimized for SSE
+ * XXXX_avx	The version that uses an ASM function optimized for AVX
+ * XXXX_avx2	The version that uses an ASM function optimized for AVX2
+ * XXXX_avx512	The version that uses an ASM function optimized for AVX512
+ *
+ ******************************************************************/
+
+/**
+ * @brief Multi-hash sha256 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * Base update() function that does not require SIMD support.
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha256_update_base (struct mh_sha256_ctx* ctx, const void* buffer, uint32_t len);
+
+/**
+ * @brief Multi-hash sha256 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires SSE
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha256_update_sse (struct mh_sha256_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Multi-hash sha256 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha256_update_avx (struct mh_sha256_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Multi-hash sha256 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX2
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha256_update_avx2 (struct mh_sha256_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Multi-hash sha256 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX512
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha256_update_avx512 (struct mh_sha256_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+
+/**
+ * @brief Finalize the message digests for multi-hash sha256.
+ *
+ * Place the message digests in mh_sha256_digest,
+ * which must have enough space for the outputs.
+ * Base Finalize() function that does not require SIMD support.
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param mh_sha256_digest The digest of mh_sha256
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha256_finalize_base (struct mh_sha256_ctx* ctx,
+ void* mh_sha256_digest);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha256.
+ *
+ * Place the message digests in mh_sha256_digest,
+ * which must have enough space for the outputs.
+ *
+ * @requires SSE
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param mh_sha256_digest The digest of mh_sha256
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha256_finalize_sse (struct mh_sha256_ctx* ctx,
+ void* mh_sha256_digest);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha256.
+ *
+ * Place the message digests in mh_sha256_digest,
+ * which must have enough space for the outputs.
+ *
+ * @requires AVX
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param mh_sha256_digest The digest of mh_sha256
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha256_finalize_avx (struct mh_sha256_ctx* ctx,
+ void* mh_sha256_digest);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha256.
+ *
+ * Place the message digests in mh_sha256_digest,
+ * which must have enough space for the outputs.
+ *
+ * @requires AVX2
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param mh_sha256_digest The digest of mh_sha256
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha256_finalize_avx2 (struct mh_sha256_ctx* ctx,
+ void* mh_sha256_digest);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha256.
+ *
+ * Place the message digests in mh_sha256_digest,
+ * which must have enough space for the outputs.
+ *
+ * @requires AVX512
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param mh_sha256_digest The digest of mh_sha256
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha256_finalize_avx512 (struct mh_sha256_ctx* ctx,
+ void* mh_sha256_digest);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/src/crypto/isa-l/isa-l_crypto/include/multi_buffer.h b/src/crypto/isa-l/isa-l_crypto/include/multi_buffer.h
new file mode 100644
index 000000000..ac88f7b0a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/multi_buffer.h
@@ -0,0 +1,112 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MULTI_BUFFER_H_
+#define _MULTI_BUFFER_H_
+
+/**
+ * @file multi_buffer.h
+ * @brief Multi-buffer common fields
+ *
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @enum JOB_STS
+ * @brief Job return codes
+ */
+
+typedef enum {STS_UNKNOWN = 0, //!< STS_UNKNOWN
+ STS_BEING_PROCESSED = 1,//!< STS_BEING_PROCESSED
+ STS_COMPLETED = 2, //!< STS_COMPLETED
+ STS_INTERNAL_ERROR, //!< STS_INTERNAL_ERROR
+ STS_ERROR //!< STS_ERROR
+} JOB_STS;
+
+#define HASH_MB_NO_FLAGS 0
+#define HASH_MB_FIRST 1
+#define HASH_MB_LAST 2
+
+/* Common flags for the new API only */
+
+/**
+ * @enum HASH_CTX_FLAG
+ * @brief CTX job type
+ */
+typedef enum {
+ HASH_UPDATE = 0x00, //!< HASH_UPDATE
+ HASH_FIRST = 0x01, //!< HASH_FIRST
+ HASH_LAST = 0x02, //!< HASH_LAST
+ HASH_ENTIRE = 0x03, //!< HASH_ENTIRE
+} HASH_CTX_FLAG;
+
+/**
+ * @enum HASH_CTX_STS
+ * @brief CTX status flags
+ */
+typedef enum {
+ HASH_CTX_STS_IDLE = 0x00, //!< HASH_CTX_STS_IDLE
+ HASH_CTX_STS_PROCESSING = 0x01, //!< HASH_CTX_STS_PROCESSING
+ HASH_CTX_STS_LAST = 0x02, //!< HASH_CTX_STS_LAST
+ HASH_CTX_STS_COMPLETE = 0x04, //!< HASH_CTX_STS_COMPLETE
+} HASH_CTX_STS;
+
+/**
+ * @enum HASH_CTX_ERROR
+ * @brief CTX error flags
+ */
+typedef enum {
+ HASH_CTX_ERROR_NONE = 0, //!< HASH_CTX_ERROR_NONE
+ HASH_CTX_ERROR_INVALID_FLAGS = -1, //!< HASH_CTX_ERROR_INVALID_FLAGS
+ HASH_CTX_ERROR_ALREADY_PROCESSING = -2, //!< HASH_CTX_ERROR_ALREADY_PROCESSING
+ HASH_CTX_ERROR_ALREADY_COMPLETED = -3, //!< HASH_CTX_ERROR_ALREADY_COMPLETED
+} HASH_CTX_ERROR;
+
+
+#define hash_ctx_user_data(ctx) ((ctx)->user_data)
+#define hash_ctx_digest(ctx) ((ctx)->job.result_digest)
+#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING)
+#define hash_ctx_complete(ctx) ((ctx)->status == HASH_CTX_STS_COMPLETE)
+#define hash_ctx_status(ctx) ((ctx)->status)
+#define hash_ctx_error(ctx) ((ctx)->error)
+#define hash_ctx_init(ctx) \
+ do { \
+ (ctx)->error = HASH_CTX_ERROR_NONE; \
+ (ctx)->status = HASH_CTX_STS_COMPLETE; \
+ } while(0)
+
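+/*
+ * Typical lifecycle (a sketch; the concrete CTX and manager types live in the
+ * per-hash headers such as sha1_mb.h): run hash_ctx_init() once on a new
+ * context, submit the first buffer with HASH_FIRST, follow-on buffers with
+ * HASH_UPDATE and the final buffer with HASH_LAST (or a single buffer with
+ * HASH_ENTIRE). Once hash_ctx_complete() reports true, the status is
+ * HASH_CTX_STS_COMPLETE and hash_ctx_digest() yields the result digest.
+ */
+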
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _MULTI_BUFFER_H_
diff --git a/src/crypto/isa-l/isa-l_crypto/include/multibinary.asm b/src/crypto/isa-l/isa-l_crypto/include/multibinary.asm
new file mode 100644
index 000000000..4dd019319
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/multibinary.asm
@@ -0,0 +1,517 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifndef _MULTIBINARY_ASM_
+%define _MULTIBINARY_ASM_
+
+%ifidn __OUTPUT_FORMAT__, elf32
+ %define mbin_def_ptr dd
+ %define mbin_ptr_sz dword
+ %define mbin_rdi edi
+ %define mbin_rsi esi
+ %define mbin_rax eax
+ %define mbin_rbx ebx
+ %define mbin_rcx ecx
+ %define mbin_rdx edx
+%else
+ %define mbin_def_ptr dq
+ %define mbin_ptr_sz qword
+ %define mbin_rdi rdi
+ %define mbin_rsi rsi
+ %define mbin_rax rax
+ %define mbin_rbx rbx
+ %define mbin_rcx rcx
+ %define mbin_rdx rdx
+%endif
+
+%ifndef AS_FEATURE_LEVEL
+%define AS_FEATURE_LEVEL 4
+%endif
+
+;;;;
+; multibinary macro:
+; creates the visible entry point that uses the HW optimized call pointer
+; creates the init of the HW optimized call pointer
+;;;;
+%macro mbin_interface 1
+ ;;;;
+ ; *_dispatched is defaulted to *_mbinit and replaced on first call.
+ ; Therefore, *_dispatch_init is only executed on first call.
+ ;;;;
+ section .data
+ %1_dispatched:
+ mbin_def_ptr %1_mbinit
+
+ section .text
+ mk_global %1, function
+ %1_mbinit:
+ ;;; only called the first time to setup hardware match
+ call %1_dispatch_init
+ ;;; falls thru to execute the hw optimized code
+ %1:
+ jmp mbin_ptr_sz [%1_dispatched]
+%endmacro
+
+;;;;;
+; mbin_dispatch_init parameters
+; Use this function when SSE/00/01 is a minimum requirement
+; 1-> function name
+; 2-> SSE/00/01 optimized function used as base
+; 3-> AVX or AVX/02 opt func
+; 4-> AVX2 or AVX/04 opt func
+;;;;;
+%macro mbin_dispatch_init 4
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ lea mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01
+
+ mov eax, 1
+ cpuid
+ and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ lea mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func
+ jne _%1_init_done ; AVX is not available so end
+ mov mbin_rsi, mbin_rbx
+
+ ;; Try for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen4) opt func
+ cmovne mbin_rsi, mbin_rbx
+
+ ;; Does it have xmm and ymm support
+ xor ecx, ecx
+ xgetbv
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ je _%1_init_done
+ lea mbin_rsi, [%2 WRT_OPT]
+
+ _%1_init_done:
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
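+
+;;;;
+; Illustrative usage sketch (not taken from the upstream sources): a module
+; providing foo_sse/foo_avx/foo_avx2 variants (hypothetical names) would
+; typically emit
+;	mbin_interface		foo
+;	mbin_dispatch_init	foo, foo_sse, foo_avx, foo_avx2
+; so that the first call to foo() runs foo_dispatch_init, stores the address
+; of the best implementation in foo_dispatched, and every later call jumps
+; straight to it through that pointer.
+;;;;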
+
+;;;;;
+; mbin_dispatch_init2 parameters
+; Cases where only base functions are available
+; 1-> function name
+; 2-> base function
+;;;;;
+%macro mbin_dispatch_init2 2
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ lea mbin_rsi, [%2 WRT_OPT] ; Default
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
+
+;;;;;
+; mbin_dispatch_init5 parameters
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_1 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+;;;;;
+%macro mbin_dispatch_init5 5
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+ mov eax, 1
+ cpuid
+ ; Test for SSE4.1
+ test ecx, FLAG_CPUID1_ECX_SSE4_1
+ lea mbin_rbx, [%3 WRT_OPT] ; SSE opt func
+ cmovne mbin_rsi, mbin_rbx
+
+ and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen2) opt func
+ jne _%1_init_done ; AVX is not available so end
+ mov mbin_rsi, mbin_rbx
+
+ ;; Try for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ lea mbin_rbx, [%5 WRT_OPT] ; AVX (gen4) opt func
+ cmovne mbin_rsi, mbin_rbx
+
+ ;; Does it have xmm and ymm support
+ xor ecx, ecx
+ xgetbv
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ je _%1_init_done
+ lea mbin_rsi, [%3 WRT_OPT]
+
+ _%1_init_done:
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
+
+%if AS_FEATURE_LEVEL >= 6
+;;;;;
+; mbin_dispatch_init6 parameters
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_1 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+; 6-> AVX512/06 opt func
+;;;;;
+%macro mbin_dispatch_init6 6
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ push mbin_rdi
+ lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+ mov eax, 1
+ cpuid
+ mov ebx, ecx ; save cpuid1.ecx
+ test ecx, FLAG_CPUID1_ECX_SSE4_1
+ je _%1_init_done ; Use base function if no SSE4_1
+ lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
+
+ ;; Test for XMM_YMM support/AVX
+ test ecx, FLAG_CPUID1_ECX_OSXSAVE
+ je _%1_init_done
+ xor ecx, ecx
+ xgetbv ; xcr -> edx:eax
+	mov	edi, eax	; save xgetbv.eax
+
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ jne _%1_init_done
+ test ebx, FLAG_CPUID1_ECX_AVX
+ je _%1_init_done
+ lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
+
+ ;; Test for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ je _%1_init_done ; No AVX2 possible
+ lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func
+
+ ;; Test for AVX512
+ and edi, FLAG_XGETBV_EAX_ZMM_OPM
+ cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
+ jne _%1_init_done ; No AVX512 possible
+ and ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt
+ cmove mbin_rsi, mbin_rbx
+
+ _%1_init_done:
+ pop mbin_rdi
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
+
+%else
+%macro mbin_dispatch_init6 6
+ mbin_dispatch_init5 %1, %2, %3, %4, %5
+%endmacro
+%endif
+
+%if AS_FEATURE_LEVEL >= 10
+;;;;;
+; mbin_dispatch_init7 parameters
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_2 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+; 6-> AVX512/06 opt func
+; 7-> AVX512 Update/10 opt func
+;;;;;
+%macro mbin_dispatch_init7 7
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ push mbin_rdi
+ lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+ mov eax, 1
+ cpuid
+ mov ebx, ecx ; save cpuid1.ecx
+ test ecx, FLAG_CPUID1_ECX_SSE4_2
+ je _%1_init_done ; Use base function if no SSE4_2
+ lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
+
+ ;; Test for XMM_YMM support/AVX
+ test ecx, FLAG_CPUID1_ECX_OSXSAVE
+ je _%1_init_done
+ xor ecx, ecx
+ xgetbv ; xcr -> edx:eax
+	mov	edi, eax	; save xgetbv.eax
+
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ jne _%1_init_done
+ test ebx, FLAG_CPUID1_ECX_AVX
+ je _%1_init_done
+ lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
+
+ ;; Test for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ je _%1_init_done ; No AVX2 possible
+ lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func
+
+ ;; Test for AVX512
+ and edi, FLAG_XGETBV_EAX_ZMM_OPM
+ cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
+ jne _%1_init_done ; No AVX512 possible
+ and ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt
+ cmove mbin_rsi, mbin_rbx
+
+ and ecx, FLAGS_CPUID7_ECX_AVX512_G2
+ cmp ecx, FLAGS_CPUID7_ECX_AVX512_G2
+	lea	mbin_rbx, [%7 WRT_OPT] ; AVX512/10 opt
+ cmove mbin_rsi, mbin_rbx
+
+ _%1_init_done:
+ pop mbin_rdi
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
+%else
+%macro mbin_dispatch_init7 7
+ mbin_dispatch_init6 %1, %2, %3, %4, %5, %6
+%endmacro
+%endif
+
+;;;;;
+; mbin_dispatch_sse_to_avx2_shani parameters
+; derived from mbin_dispatch_init
+; Use this function when SSE/00/01 is a minimum requirement
+; 1-> function name
+; 2-> SSE/00/01 optimized function used as base
+; 3-> AVX or AVX/02 opt func
+; 4-> AVX2 or AVX/04 opt func
+; 5-> SHANI opt for GLM
+;;;;;
+%macro mbin_dispatch_sse_to_avx2_shani 5
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ lea mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01
+
+ mov eax, 1
+ cpuid
+ and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ lea mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func
+ jne _%1_shani_check ; AVX is not available so check shani
+ mov mbin_rsi, mbin_rbx
+
+ ;; Try for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen4) opt func
+ cmovne mbin_rsi, mbin_rbx
+
+ ;; Does it have xmm and ymm support
+ xor ecx, ecx
+ xgetbv
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ je _%1_init_done
+ lea mbin_rsi, [%2 WRT_OPT]
+
+ _%1_init_done:
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+
+ _%1_shani_check:
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_SHA
+ lea mbin_rbx, [%5 WRT_OPT] ; SHANI opt func
+ cmovne mbin_rsi, mbin_rbx
+ jmp _%1_init_done ; end
+%endmacro
+
+;;;;;
+; mbin_dispatch_base_to_avx512_shani parameters
+; derived from mbin_dispatch_init6
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_2 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+; 6-> AVX512/06 opt func
+; 7-> SHANI opt for GLM
+; 8-> SHANI opt for CNL
+;;;;;
+%macro mbin_dispatch_base_to_avx512_shani 8
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ push mbin_rdi
+ lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+ mov eax, 1
+ cpuid
+ mov ebx, ecx ; save cpuid1.ecx
+ test ecx, FLAG_CPUID1_ECX_SSE4_2
+ je _%1_init_done ; Use base function if no SSE4_2
+ lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
+
+ ;; Test for XMM_YMM support/AVX
+ test ecx, FLAG_CPUID1_ECX_OSXSAVE
+ je _%1_shani_check
+ xor ecx, ecx
+ xgetbv ; xcr -> edx:eax
+	mov	edi, eax	; save xgetbv.eax
+
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ jne _%1_shani_check
+ test ebx, FLAG_CPUID1_ECX_AVX
+ je _%1_shani_check
+ lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
+
+ ;; Test for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ je _%1_init_done ; No AVX2 possible
+ lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func
+
+ ;; Test for AVX512
+ and edi, FLAG_XGETBV_EAX_ZMM_OPM
+ cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
+ jne _%1_init_done ; No AVX512 possible
+ and ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt
+ cmove mbin_rsi, mbin_rbx
+
+ ;; Test for SHANI
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_SHA
+	lea	mbin_rbx, [%8 WRT_OPT] ; SHANI opt func for CNL
+ cmovne mbin_rsi, mbin_rbx
+
+ _%1_init_done:
+ pop mbin_rdi
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+
+ _%1_shani_check:
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_SHA
+ lea mbin_rbx, [%7 WRT_OPT] ; SHANI opt sse func
+ cmovne mbin_rsi, mbin_rbx
+ jmp _%1_init_done ; end
+%endmacro
+
+
+
+%endif ; ifndef _MULTIBINARY_ASM_
diff --git a/src/crypto/isa-l/isa-l_crypto/include/reg_sizes.asm b/src/crypto/isa-l/isa-l_crypto/include/reg_sizes.asm
new file mode 100644
index 000000000..717dd0503
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/reg_sizes.asm
@@ -0,0 +1,442 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifndef _REG_SIZES_ASM_
+%define _REG_SIZES_ASM_
+
+%ifndef AS_FEATURE_LEVEL
+%define AS_FEATURE_LEVEL 4
+%endif
+
+%define EFLAGS_HAS_CPUID (1<<21)
+%define FLAG_CPUID1_ECX_CLMUL (1<<1)
+%define FLAG_CPUID1_EDX_SSE2 (1<<26)
+%define FLAG_CPUID1_ECX_SSE3 (1)
+%define FLAG_CPUID1_ECX_SSE4_1 (1<<19)
+%define FLAG_CPUID1_ECX_SSE4_2 (1<<20)
+%define FLAG_CPUID1_ECX_POPCNT (1<<23)
+%define FLAG_CPUID1_ECX_AESNI (1<<25)
+%define FLAG_CPUID1_ECX_OSXSAVE (1<<27)
+%define FLAG_CPUID1_ECX_AVX (1<<28)
+%define FLAG_CPUID1_EBX_AVX2 (1<<5)
+
+%define FLAG_CPUID7_EBX_AVX2 (1<<5)
+%define FLAG_CPUID7_EBX_AVX512F (1<<16)
+%define FLAG_CPUID7_EBX_AVX512DQ (1<<17)
+%define FLAG_CPUID7_EBX_AVX512IFMA (1<<21)
+%define FLAG_CPUID7_EBX_AVX512PF (1<<26)
+%define FLAG_CPUID7_EBX_AVX512ER (1<<27)
+%define FLAG_CPUID7_EBX_AVX512CD (1<<28)
+%define FLAG_CPUID7_EBX_SHA (1<<29)
+%define FLAG_CPUID7_EBX_AVX512BW (1<<30)
+%define FLAG_CPUID7_EBX_AVX512VL (1<<31)
+
+%define FLAG_CPUID7_ECX_AVX512VBMI (1<<1)
+%define FLAG_CPUID7_ECX_AVX512VBMI2 (1 << 6)
+%define FLAG_CPUID7_ECX_GFNI (1 << 8)
+%define FLAG_CPUID7_ECX_VAES (1 << 9)
+%define FLAG_CPUID7_ECX_VPCLMULQDQ (1 << 10)
+%define FLAG_CPUID7_ECX_VNNI (1 << 11)
+%define FLAG_CPUID7_ECX_BITALG (1 << 12)
+%define FLAG_CPUID7_ECX_VPOPCNTDQ (1 << 14)
+
+%define FLAGS_CPUID7_EBX_AVX512_G1 (FLAG_CPUID7_EBX_AVX512F | FLAG_CPUID7_EBX_AVX512VL | FLAG_CPUID7_EBX_AVX512BW | FLAG_CPUID7_EBX_AVX512CD | FLAG_CPUID7_EBX_AVX512DQ)
+%define FLAGS_CPUID7_ECX_AVX512_G2 (FLAG_CPUID7_ECX_AVX512VBMI2 | FLAG_CPUID7_ECX_GFNI | FLAG_CPUID7_ECX_VAES | FLAG_CPUID7_ECX_VPCLMULQDQ | FLAG_CPUID7_ECX_VNNI | FLAG_CPUID7_ECX_BITALG | FLAG_CPUID7_ECX_VPOPCNTDQ)
+
+%define FLAG_XGETBV_EAX_XMM (1<<1)
+%define FLAG_XGETBV_EAX_YMM (1<<2)
+%define FLAG_XGETBV_EAX_XMM_YMM 0x6
+%define FLAG_XGETBV_EAX_ZMM_OPM 0xe0
+
+%define FLAG_CPUID1_EAX_AVOTON 0x000406d0
+%define FLAG_CPUID1_EAX_STEP_MASK 0xfffffff0
+
+; define d and w variants for registers
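+; (e.g. raxd/raxw/raxb alias eax/ax/al) so the DWORD()/WORD()/BYTE() macros
+; defined further below can select a sub-register by pasting a size suffix
+; onto the 64-bit register name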
+
+%define raxd eax
+%define raxw ax
+%define raxb al
+
+%define rbxd ebx
+%define rbxw bx
+%define rbxb bl
+
+%define rcxd ecx
+%define rcxw cx
+%define rcxb cl
+
+%define rdxd edx
+%define rdxw dx
+%define rdxb dl
+
+%define rsid esi
+%define rsiw si
+%define rsib sil
+
+%define rdid edi
+%define rdiw di
+%define rdib dil
+
+%define rbpd ebp
+%define rbpw bp
+%define rbpb bpl
+
+%define zmm0x xmm0
+%define zmm1x xmm1
+%define zmm2x xmm2
+%define zmm3x xmm3
+%define zmm4x xmm4
+%define zmm5x xmm5
+%define zmm6x xmm6
+%define zmm7x xmm7
+%define zmm8x xmm8
+%define zmm9x xmm9
+%define zmm10x xmm10
+%define zmm11x xmm11
+%define zmm12x xmm12
+%define zmm13x xmm13
+%define zmm14x xmm14
+%define zmm15x xmm15
+%define zmm16x xmm16
+%define zmm17x xmm17
+%define zmm18x xmm18
+%define zmm19x xmm19
+%define zmm20x xmm20
+%define zmm21x xmm21
+%define zmm22x xmm22
+%define zmm23x xmm23
+%define zmm24x xmm24
+%define zmm25x xmm25
+%define zmm26x xmm26
+%define zmm27x xmm27
+%define zmm28x xmm28
+%define zmm29x xmm29
+%define zmm30x xmm30
+%define zmm31x xmm31
+
+%define ymm0x xmm0
+%define ymm1x xmm1
+%define ymm2x xmm2
+%define ymm3x xmm3
+%define ymm4x xmm4
+%define ymm5x xmm5
+%define ymm6x xmm6
+%define ymm7x xmm7
+%define ymm8x xmm8
+%define ymm9x xmm9
+%define ymm10x xmm10
+%define ymm11x xmm11
+%define ymm12x xmm12
+%define ymm13x xmm13
+%define ymm14x xmm14
+%define ymm15x xmm15
+%define ymm16x xmm16
+%define ymm17x xmm17
+%define ymm18x xmm18
+%define ymm19x xmm19
+%define ymm20x xmm20
+%define ymm21x xmm21
+%define ymm22x xmm22
+%define ymm23x xmm23
+%define ymm24x xmm24
+%define ymm25x xmm25
+%define ymm26x xmm26
+%define ymm27x xmm27
+%define ymm28x xmm28
+%define ymm29x xmm29
+%define ymm30x xmm30
+%define ymm31x xmm31
+
+%define xmm0x xmm0
+%define xmm1x xmm1
+%define xmm2x xmm2
+%define xmm3x xmm3
+%define xmm4x xmm4
+%define xmm5x xmm5
+%define xmm6x xmm6
+%define xmm7x xmm7
+%define xmm8x xmm8
+%define xmm9x xmm9
+%define xmm10x xmm10
+%define xmm11x xmm11
+%define xmm12x xmm12
+%define xmm13x xmm13
+%define xmm14x xmm14
+%define xmm15x xmm15
+%define xmm16x xmm16
+%define xmm17x xmm17
+%define xmm18x xmm18
+%define xmm19x xmm19
+%define xmm20x xmm20
+%define xmm21x xmm21
+%define xmm22x xmm22
+%define xmm23x xmm23
+%define xmm24x xmm24
+%define xmm25x xmm25
+%define xmm26x xmm26
+%define xmm27x xmm27
+%define xmm28x xmm28
+%define xmm29x xmm29
+%define xmm30x xmm30
+%define xmm31x xmm31
+
+%define zmm0y ymm0
+%define zmm1y ymm1
+%define zmm2y ymm2
+%define zmm3y ymm3
+%define zmm4y ymm4
+%define zmm5y ymm5
+%define zmm6y ymm6
+%define zmm7y ymm7
+%define zmm8y ymm8
+%define zmm9y ymm9
+%define zmm10y ymm10
+%define zmm11y ymm11
+%define zmm12y ymm12
+%define zmm13y ymm13
+%define zmm14y ymm14
+%define zmm15y ymm15
+%define zmm16y ymm16
+%define zmm17y ymm17
+%define zmm18y ymm18
+%define zmm19y ymm19
+%define zmm20y ymm20
+%define zmm21y ymm21
+%define zmm22y ymm22
+%define zmm23y ymm23
+%define zmm24y ymm24
+%define zmm25y ymm25
+%define zmm26y ymm26
+%define zmm27y ymm27
+%define zmm28y ymm28
+%define zmm29y ymm29
+%define zmm30y ymm30
+%define zmm31y ymm31
+
+%define xmm0y ymm0
+%define xmm1y ymm1
+%define xmm2y ymm2
+%define xmm3y ymm3
+%define xmm4y ymm4
+%define xmm5y ymm5
+%define xmm6y ymm6
+%define xmm7y ymm7
+%define xmm8y ymm8
+%define xmm9y ymm9
+%define xmm10y ymm10
+%define xmm11y ymm11
+%define xmm12y ymm12
+%define xmm13y ymm13
+%define xmm14y ymm14
+%define xmm15y ymm15
+%define xmm16y ymm16
+%define xmm17y ymm17
+%define xmm18y ymm18
+%define xmm19y ymm19
+%define xmm20y ymm20
+%define xmm21y ymm21
+%define xmm22y ymm22
+%define xmm23y ymm23
+%define xmm24y ymm24
+%define xmm25y ymm25
+%define xmm26y ymm26
+%define xmm27y ymm27
+%define xmm28y ymm28
+%define xmm29y ymm29
+%define xmm30y ymm30
+%define xmm31y ymm31
+
+%define xmm0z zmm0
+%define xmm1z zmm1
+%define xmm2z zmm2
+%define xmm3z zmm3
+%define xmm4z zmm4
+%define xmm5z zmm5
+%define xmm6z zmm6
+%define xmm7z zmm7
+%define xmm8z zmm8
+%define xmm9z zmm9
+%define xmm10z zmm10
+%define xmm11z zmm11
+%define xmm12z zmm12
+%define xmm13z zmm13
+%define xmm14z zmm14
+%define xmm15z zmm15
+%define xmm16z zmm16
+%define xmm17z zmm17
+%define xmm18z zmm18
+%define xmm19z zmm19
+%define xmm20z zmm20
+%define xmm21z zmm21
+%define xmm22z zmm22
+%define xmm23z zmm23
+%define xmm24z zmm24
+%define xmm25z zmm25
+%define xmm26z zmm26
+%define xmm27z zmm27
+%define xmm28z zmm28
+%define xmm29z zmm29
+%define xmm30z zmm30
+%define xmm31z zmm31
+
+%define ymm0z zmm0
+%define ymm1z zmm1
+%define ymm2z zmm2
+%define ymm3z zmm3
+%define ymm4z zmm4
+%define ymm5z zmm5
+%define ymm6z zmm6
+%define ymm7z zmm7
+%define ymm8z zmm8
+%define ymm9z zmm9
+%define ymm10z zmm10
+%define ymm11z zmm11
+%define ymm12z zmm12
+%define ymm13z zmm13
+%define ymm14z zmm14
+%define ymm15z zmm15
+%define ymm16z zmm16
+%define ymm17z zmm17
+%define ymm18z zmm18
+%define ymm19z zmm19
+%define ymm20z zmm20
+%define ymm21z zmm21
+%define ymm22z zmm22
+%define ymm23z zmm23
+%define ymm24z zmm24
+%define ymm25z zmm25
+%define ymm26z zmm26
+%define ymm27z zmm27
+%define ymm28z zmm28
+%define ymm29z zmm29
+%define ymm30z zmm30
+%define ymm31z zmm31
+
+%define DWORD(reg) reg %+ d
+%define WORD(reg) reg %+ w
+%define BYTE(reg) reg %+ b
+
+%define XWORD(reg) reg %+ x
+%define YWORD(reg) reg %+ y
+%define ZWORD(reg) reg %+ z
+
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+ %define __x86_64__
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%endif
+%ifidn __OUTPUT_FORMAT__,win64
+ %define __x86_64__
+%endif
+%ifidn __OUTPUT_FORMAT__,macho64
+ %define __x86_64__
+%endif
+
+%ifdef __x86_64__
+ %define endbranch db 0xf3, 0x0f, 0x1e, 0xfa
+%else
+ %define endbranch db 0xf3, 0x0f, 0x1e, 0xfb
+%endif
+
+%ifdef REL_TEXT
+ %define WRT_OPT
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define WRT_OPT wrt ..plt
+%else
+ %define WRT_OPT
+%endif
+
+%macro mk_global 1-3
+ %ifdef __NASM_VER__
+ %ifidn __OUTPUT_FORMAT__, macho64
+ global %1
+ %elifidn __OUTPUT_FORMAT__, win64
+ global %1
+ %else
+ global %1:%2 %3
+ %endif
+ %else
+ global %1:%2 %3
+ %endif
+%endmacro
+
+
+; Fixes for nasm's lack of MS proc helpers
+%ifdef __NASM_VER__
+ %ifidn __OUTPUT_FORMAT__, win64
+ %macro alloc_stack 1
+ sub rsp, %1
+ %endmacro
+
+ %macro proc_frame 1
+ %1:
+ %endmacro
+
+ %macro save_xmm128 2
+ movdqa [rsp + %2], %1
+ %endmacro
+
+ %macro save_reg 2
+ mov [rsp + %2], %1
+ %endmacro
+
+ %macro rex_push_reg 1
+ push %1
+ %endmacro
+
+ %macro push_reg 1
+ push %1
+ %endmacro
+
+ %define end_prolog
+ %endif
+
+ %define endproc_frame
+%endif
+
+%ifidn __OUTPUT_FORMAT__, macho64
+ %define elf64 macho64
+ mac_equ equ 1
+%endif
+
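+; slversion: emits <name>_slver and <name>_slver_<ver> data symbols (the
+; version built from the three arguments) so the implementation version of
+; each routine can be identified in the built library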
+%macro slversion 4
+ section .text
+ global %1_slver_%2%3%4
+ global %1_slver
+ %1_slver:
+ %1_slver_%2%3%4:
+ dw 0x%4
+ db 0x%3, 0x%2
+%endmacro
+
+%endif ; ifndef _REG_SIZES_ASM_
diff --git a/src/crypto/isa-l/isa-l_crypto/include/rolling_hashx.h b/src/crypto/isa-l/isa-l_crypto/include/rolling_hashx.h
new file mode 100644
index 000000000..035cf1701
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/rolling_hashx.h
@@ -0,0 +1,114 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/**
+ * @file rolling_hashx.h
+ * @brief Fingerprint functions based on rolling hash
+ *
+ * rolling_hash2 - checks hash in a sliding window based on random 64-bit hash.
+ */
+
+#ifndef _ROLLING_HASHX_H_
+#define _ROLLING_HASHX_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+/**
+ * @brief rolling hash return values
+ */
+enum {
+ FINGERPRINT_RET_HIT = 0, //!< Fingerprint trigger hit
+ FINGERPRINT_RET_MAX, //!< Fingerprint max length reached before hit
+ FINGERPRINT_RET_OTHER //!< Fingerprint function error returned
+};
+
+#define FINGERPRINT_MAX_WINDOW 48
+
+/**
+ * @brief Context for rolling_hash2 functions
+ */
+struct rh_state2 {
+ uint8_t history[FINGERPRINT_MAX_WINDOW];
+ uint64_t table1[256];
+ uint64_t table2[256];
+ uint64_t hash;
+ uint32_t w;
+};
+
+/**
+ * @brief Initialize state object for rolling hash2
+ *
+ * @param state Structure holding state info on current rolling hash
+ * @param w Window width (1 <= w <= 32)
+ * @returns 0 - success, -1 - failure
+ */
+int rolling_hash2_init(struct rh_state2 *state, uint32_t w);
+
+/**
+ * @brief Reset the hash state history
+ *
+ * @param state Structure holding state info on current rolling hash
+ * @param init_bytes Optional buffer of window-size bytes used to pre-initialize the hash
+ * @returns none
+ */
+void rolling_hash2_reset(struct rh_state2 *state, uint8_t * init_bytes);
+
+/**
+ * @brief Run rolling hash function until trigger met or max length reached
+ *
+ * Checks for trigger based on a random hash in a sliding window.
+ * @param state Structure holding state info on current rolling hash
+ * @param buffer Pointer to input buffer to run windowed hash on
+ * @param max_len Max length to run over input
+ * @param mask Mask bits ORed with hash before test with trigger
+ * @param trigger Match value to compare with windowed hash at each input byte
+ * @param offset Offset from buffer to match, set if match found
+ * @returns FINGERPRINT_RET_HIT - match found, FINGERPRINT_RET_MAX - exceeded max length
+ */
+int rolling_hash2_run(struct rh_state2 *state, uint8_t * buffer, uint32_t max_len,
+ uint32_t mask, uint32_t trigger, uint32_t * offset);
+
+/**
+ * @brief Generate an appropriate mask to target mean hit rate
+ *
+ * @param mean Target chunk size in bytes
+ * @param shift Bits to rotate result to get independent masks
+ * @returns 32-bit mask value
+ */
+uint32_t rolling_hashx_mask_gen(long mean, int shift);
+
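+/*
+ * Illustrative content-defined chunking sketch (a usage outline, not part of
+ * the upstream interface): scan a buffer with rolling_hash2_run() and cut a
+ * chunk wherever the windowed hash matches the trigger, or at a 32 KB cap if
+ * no trigger fires. The input (data/data_len), the 8 KB target mean, the cap,
+ * the trigger choice and the consume_chunk() helper are assumptions made only
+ * for this example.
+ *
+ * \code
+ * struct rh_state2 *state = malloc(sizeof(*state));
+ * uint32_t mask = rolling_hashx_mask_gen(8 * 1024, 0);
+ * uint32_t trigger = 0x123 & mask;
+ * uint8_t *p = data;
+ * uint32_t remain = data_len, offset, cut;
+ *
+ * rolling_hash2_init(state, 32);
+ * while (remain > 0) {
+ *	uint32_t max = remain < 32 * 1024 ? remain : 32 * 1024;
+ *	int ret = rolling_hash2_run(state, p, max, mask, trigger, &offset);
+ *	cut = (ret == FINGERPRINT_RET_HIT && offset != 0) ? offset : max;
+ *	consume_chunk(p, cut);
+ *	p += cut;
+ *	remain -= cut;
+ * }
+ * free(state);
+ * \endcode
+ */
+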
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _ROLLING_HASHX_H_
diff --git a/src/crypto/isa-l/isa-l_crypto/include/sha1_mb.h b/src/crypto/isa-l/isa-l_crypto/include/sha1_mb.h
new file mode 100644
index 000000000..3a41684b4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/sha1_mb.h
@@ -0,0 +1,450 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _SHA1_MB_H_
+#define _SHA1_MB_H_
+
+/**
+ * @file sha1_mb.h
+ * @brief Multi-buffer CTX API SHA1 function prototypes and structures
+ *
+ * Interface for multi-buffer SHA1 functions
+ *
+ * <b> Multi-buffer SHA1 Entire or First-Update..Update-Last </b>
+ *
+ * The interface to this multi-buffer hashing code is carried out through the
+ * context-level (CTX) init, submit and flush functions and the SHA1_HASH_CTX_MGR and
+ * SHA1_HASH_CTX objects. Numerous SHA1_HASH_CTX objects may be instantiated by the
+ * application for use with a single SHA1_HASH_CTX_MGR.
+ *
+ * The CTX interface functions carry out the initialization and padding of the jobs
+ * entered by the user and add them to the multi-buffer manager. The lower level "scheduler"
+ * layer then processes the jobs in an out-of-order manner. The scheduler layer functions
+ * are internal and are not intended to be invoked directly. Jobs can be submitted
+ * to a CTX as a complete buffer to be hashed, using the HASH_ENTIRE flag, or as partial
+ * jobs which can be started using the HASH_FIRST flag, and later resumed or finished
+ * using the HASH_UPDATE and HASH_LAST flags respectively.
+ *
+ * <b>Note:</b> The submit function does not require data buffers to be block sized.
+ *
+ * The SHA1 CTX interface functions are available for 4 architectures: SSE, AVX, AVX2 and
+ * AVX512. In addition, a multibinary interface is provided, which selects the appropriate
+ * architecture-specific function at runtime.
+ *
+ * <b>Usage:</b> The application creates a SHA1_HASH_CTX_MGR object and initializes it
+ * with a call to the sha1_ctx_mgr_init*() function, where henceforth "*" stands for the
+ * relevant suffix for each architecture: _sse, _avx, _avx2, _avx512 (or no suffix for the
+ * multibinary version). The SHA1_HASH_CTX_MGR object will be used to schedule processor
+ * resources, with up to 4 SHA1_HASH_CTX objects (or 8 in the AVX2 case, 16 in the AVX512)
+ * being processed at a time.
+ *
+ * Each SHA1_HASH_CTX must be initialized before first use by the hash_ctx_init macro
+ * defined in multi_buffer.h. After initialization, the application may begin computing
+ * a hash by giving the SHA1_HASH_CTX to a SHA1_HASH_CTX_MGR using the submit functions
+ * sha1_ctx_mgr_submit*() with the HASH_FIRST flag set. When the SHA1_HASH_CTX is
+ * returned to the application (via this or a later call to sha1_ctx_mgr_submit*() or
+ * sha1_ctx_mgr_flush*()), the application can then re-submit it with another call to
+ * sha1_ctx_mgr_submit*(), but without the HASH_FIRST flag set.
+ *
+ * Ideally, on the last buffer for that hash, sha1_ctx_mgr_submit*() is called with
+ * HASH_LAST, although it is also possible to submit the hash with HASH_LAST and a zero
+ * length if necessary. When a SHA1_HASH_CTX is returned after having been submitted with
+ * HASH_LAST, it will contain a valid hash. The SHA1_HASH_CTX can be reused immediately
+ * by submitting with HASH_FIRST.
+ *
+ * For example, you would submit hashes with the following flags for the following numbers
+ * of buffers:
+ * <ul>
+ * <li> one buffer: HASH_FIRST | HASH_LAST (or, equivalently, HASH_ENTIRE)
+ * <li> two buffers: HASH_FIRST, HASH_LAST
+ * <li> three buffers: HASH_FIRST, HASH_UPDATE, HASH_LAST
+ * <li> etc.
+ * </ul>
+ *
+ * The order in which SHA1_CTX objects are returned is in general different from the order
+ * in which they are submitted.
+ *
+ * A few possible error conditions exist:
+ * <ul>
+ * <li> Submitting flags other than the allowed entire/first/update/last values
+ * <li> Submitting a context that is currently being managed by a SHA1_HASH_CTX_MGR.
+ * <li> Submitting a context after HASH_LAST is used but before HASH_FIRST is set.
+ * </ul>
+ *
+ * These error conditions are reported by returning the SHA1_HASH_CTX immediately after
+ * a submit with its error member set to a non-zero error code (defined in
+ * multi_buffer.h). No changes are made to the SHA1_HASH_CTX_MGR in the case of an
+ * error; no processing is done for other hashes.
+ *
+ */
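+
+/*
+ * Illustrative sketch of the flow described above (not part of the upstream
+ * header): hashing several whole buffers through the multibinary interface.
+ * NUM_JOBS and the arrays "bufs"/"lens" are assumptions for the example.
+ *
+ *   enum { NUM_JOBS = 8 };
+ *   SHA1_HASH_CTX_MGR mgr;
+ *   SHA1_HASH_CTX ctxpool[NUM_JOBS], *ctx;
+ *
+ *   sha1_ctx_mgr_init(&mgr);
+ *   for (int i = 0; i < NUM_JOBS; i++) {
+ *           hash_ctx_init(&ctxpool[i]);                  // macro from multi_buffer.h
+ *           ctxpool[i].user_data = (void *)(uintptr_t)i; // optional job tag
+ *           ctx = sha1_ctx_mgr_submit(&mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ *           if (ctx != NULL) {
+ *                   // a returned context has completed, or was rejected with a
+ *                   // non-zero ctx->error (codes defined in multi_buffer.h)
+ *           }
+ *   }
+ *   while ((ctx = sha1_ctx_mgr_flush(&mgr)) != NULL) {
+ *           // digest for this job is in ctx->job.result_digest[0..SHA1_DIGEST_NWORDS-1]
+ *   }
+ */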
+
+#include <stdint.h>
+#include "multi_buffer.h"
+#include "types.h"
+
+#ifndef _MSC_VER
+#include <stdbool.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Hash Constants and Typedefs
+#define SHA1_DIGEST_NWORDS 5
+#define SHA1_MAX_LANES 16
+#define SHA1_X8_LANES 8
+#define SHA1_MIN_LANES 4
+#define SHA1_BLOCK_SIZE 64
+#define SHA1_LOG2_BLOCK_SIZE 6
+#define SHA1_PADLENGTHFIELD_SIZE 8
+#define SHA1_INITIAL_DIGEST \
+ 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0
+
+typedef uint32_t sha1_digest_array[SHA1_DIGEST_NWORDS][SHA1_MAX_LANES];
+typedef uint32_t SHA1_WORD_T;
+
+/** @brief Scheduler layer - Holds info describing a single SHA1 job for the multi-buffer manager */
+
+typedef struct {
+ uint8_t* buffer; //!< pointer to data buffer for this job
+ uint32_t len; //!< length of buffer for this job in blocks.
+ DECLARE_ALIGNED(uint32_t result_digest[SHA1_DIGEST_NWORDS],64);
+ JOB_STS status; //!< output job status
+ void* user_data; //!< pointer for user's job-related data
+} SHA1_JOB;
+
+/** @brief Scheduler layer - Holds arguments for submitted SHA1 job */
+
+typedef struct {
+ sha1_digest_array digest;
+ uint8_t* data_ptr[SHA1_MAX_LANES];
+} SHA1_MB_ARGS_X16;
+
+/** @brief Scheduler layer - Lane data */
+
+typedef struct {
+ SHA1_JOB *job_in_lane;
+} SHA1_LANE_DATA;
+
+/** @brief Scheduler layer - Holds state for multi-buffer SHA1 jobs */
+
+typedef struct {
+ SHA1_MB_ARGS_X16 args;
+ uint32_t lens[SHA1_MAX_LANES];
+ uint64_t unused_lanes; //!< each nibble is index (0...3 or 0...7 or 0...15) of unused lanes, nibble 4 or 8 is set to F as a flag
+ SHA1_LANE_DATA ldata[SHA1_MAX_LANES];
+ uint32_t num_lanes_inuse;
+} SHA1_MB_JOB_MGR;
+
+/** @brief Context layer - Holds state for multi-buffer SHA1 jobs */
+
+typedef struct {
+ SHA1_MB_JOB_MGR mgr;
+} SHA1_HASH_CTX_MGR;
+
+/** @brief Context layer - Holds info describing a single SHA1 job for the multi-buffer CTX manager */
+
+typedef struct {
+ SHA1_JOB job; // Must be at struct offset 0.
+ HASH_CTX_STS status; //!< Context status flag
+ HASH_CTX_ERROR error; //!< Context error flag
+ uint64_t total_length; //!< Running counter of length processed for this CTX's job
+ const void* incoming_buffer; //!< pointer to data input buffer for this CTX's job
+ uint32_t incoming_buffer_length; //!< length of buffer for this job in bytes.
+ uint8_t partial_block_buffer[SHA1_BLOCK_SIZE * 2]; //!< CTX partial blocks
+ uint32_t partial_block_buffer_length;
+ void* user_data; //!< pointer for user to keep any job-related data
+} SHA1_HASH_CTX;
+
+/******************** multibinary function prototypes **********************/
+
+/**
+ * @brief Initialize the SHA1 multi-buffer manager structure.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha1_ctx_mgr_init (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA1 job to the multi-buffer manager.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_submit (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA1 jobs and return when complete.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_flush (SHA1_HASH_CTX_MGR* mgr);
+
+
+/*******************************************************************
+ * Context level API function prototypes
+ ******************************************************************/
+
+/**
+ * @brief Initialize the context level SHA1 multi-buffer manager structure.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha1_ctx_mgr_init_sse (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA1 job to the context level multi-buffer manager.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_submit_sse (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA1 jobs and return when complete.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_flush_sse (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the context level SHA1 multi-buffer manager structure.
+ * @requires SSE4.1 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha1_ctx_mgr_init_sse_ni (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA1 job to the context level multi-buffer manager.
+ * @requires SSE4.1 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_submit_sse_ni (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA1 jobs and return when complete.
+ * @requires SSE4.1 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_flush_sse_ni (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA1 multi-buffer manager structure.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha1_ctx_mgr_init_avx (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA1 job to the multi-buffer manager.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_submit_avx (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA1 jobs and return when complete.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_flush_avx (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA1 multi-buffer manager structure.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha1_ctx_mgr_init_avx2 (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA1 job to the multi-buffer manager.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_submit_avx2 (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA1 jobs and return when complete.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_flush_avx2 (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA1 multi-buffer manager structure.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha1_ctx_mgr_init_avx512 (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA1 job to the multi-buffer manager.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_submit_avx512 (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA1 jobs and return when complete.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_flush_avx512 (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA1 multi-buffer manager structure.
+ * @requires AVX512 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha1_ctx_mgr_init_avx512_ni (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA1 job to the multi-buffer manager.
+ * @requires AVX512 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_submit_avx512_ni (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA1 jobs and return when complete.
+ * @requires AVX512 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_flush_avx512_ni (SHA1_HASH_CTX_MGR* mgr);
+
+
+/*******************************************************************
+ * Scheduler (internal) level out-of-order function prototypes
+ ******************************************************************/
+
+void sha1_mb_mgr_init_sse (SHA1_MB_JOB_MGR *state);
+SHA1_JOB* sha1_mb_mgr_submit_sse (SHA1_MB_JOB_MGR *state, SHA1_JOB* job);
+SHA1_JOB* sha1_mb_mgr_flush_sse (SHA1_MB_JOB_MGR *state);
+
+#define sha1_mb_mgr_init_avx sha1_mb_mgr_init_sse
+SHA1_JOB* sha1_mb_mgr_submit_avx (SHA1_MB_JOB_MGR *state, SHA1_JOB* job);
+SHA1_JOB* sha1_mb_mgr_flush_avx (SHA1_MB_JOB_MGR *state);
+
+void sha1_mb_mgr_init_avx2 (SHA1_MB_JOB_MGR *state);
+SHA1_JOB* sha1_mb_mgr_submit_avx2 (SHA1_MB_JOB_MGR *state, SHA1_JOB* job);
+SHA1_JOB* sha1_mb_mgr_flush_avx2 (SHA1_MB_JOB_MGR *state);
+
+void sha1_mb_mgr_init_avx512 (SHA1_MB_JOB_MGR *state);
+SHA1_JOB* sha1_mb_mgr_submit_avx512 (SHA1_MB_JOB_MGR *state, SHA1_JOB* job);
+SHA1_JOB* sha1_mb_mgr_flush_avx512 (SHA1_MB_JOB_MGR *state);
+
+void sha1_mb_mgr_init_sse_ni (SHA1_MB_JOB_MGR *state);
+SHA1_JOB* sha1_mb_mgr_submit_sse_ni (SHA1_MB_JOB_MGR *state, SHA1_JOB* job);
+SHA1_JOB* sha1_mb_mgr_flush_sse_ni (SHA1_MB_JOB_MGR *state);
+
+void sha1_mb_mgr_init_avx512_ni (SHA1_MB_JOB_MGR *state);
+SHA1_JOB* sha1_mb_mgr_submit_avx512_ni (SHA1_MB_JOB_MGR *state, SHA1_JOB* job);
+SHA1_JOB* sha1_mb_mgr_flush_avx512_ni (SHA1_MB_JOB_MGR *state);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _SHA1_MB_H_
diff --git a/src/crypto/isa-l/isa-l_crypto/include/sha256_mb.h b/src/crypto/isa-l/isa-l_crypto/include/sha256_mb.h
new file mode 100644
index 000000000..8ef186b2d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/sha256_mb.h
@@ -0,0 +1,451 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _SHA256_MB_H_
+#define _SHA256_MB_H_
+
+/**
+ * @file sha256_mb.h
+ * @brief Multi-buffer CTX API SHA256 function prototypes and structures
+ *
+ * Interface for multi-buffer SHA256 functions
+ *
+ * <b> Multi-buffer SHA256 Entire or First-Update..Update-Last </b>
+ *
+ * The interface to this multi-buffer hashing code is carried out through the
+ * context-level (CTX) init, submit and flush functions and the SHA256_HASH_CTX_MGR and
+ * SHA256_HASH_CTX objects. Numerous SHA256_HASH_CTX objects may be instantiated by the
+ * application for use with a single SHA256_HASH_CTX_MGR.
+ *
+ * The CTX interface functions carry out the initialization and padding of the jobs
+ * entered by the user and add them to the multi-buffer manager. The lower level "scheduler"
+ * layer then processes the jobs in an out-of-order manner. The scheduler layer functions
+ * are internal and are not intended to be invoked directly. Jobs can be submitted
+ * to a CTX as a complete buffer to be hashed, using the HASH_ENTIRE flag, or as partial
+ * jobs which can be started using the HASH_FIRST flag, and later resumed or finished
+ * using the HASH_UPDATE and HASH_LAST flags respectively.
+ *
+ * <b>Note:</b> The submit function does not require data buffers to be block sized.
+ *
+ * The SHA256 CTX interface functions are available for 4 architectures: SSE, AVX, AVX2 and
+ * AVX512. In addition, a multibinary interface is provided, which selects the appropriate
+ * architecture-specific function at runtime.
+ *
+ * <b>Usage:</b> The application creates a SHA256_HASH_CTX_MGR object and initializes it
+ * with a call to the sha256_ctx_mgr_init*() function, where henceforth "*" stands for the
+ * relevant suffix for each architecture: _sse, _avx, _avx2, _avx512 (or no suffix for the
+ * multibinary version). The SHA256_HASH_CTX_MGR object will be used to schedule processor
+ * resources, with up to 4 SHA256_HASH_CTX objects (or 8 in the AVX2 case, 16 in the AVX512)
+ * being processed at a time.
+ *
+ * Each SHA256_HASH_CTX must be initialized before first use by the hash_ctx_init macro
+ * defined in multi_buffer.h. After initialization, the application may begin computing
+ * a hash by giving the SHA256_HASH_CTX to a SHA256_HASH_CTX_MGR using the submit functions
+ * sha256_ctx_mgr_submit*() with the HASH_FIRST flag set. When the SHA256_HASH_CTX is
+ * returned to the application (via this or a later call to sha256_ctx_mgr_submit*() or
+ * sha256_ctx_mgr_flush*()), the application can then re-submit it with another call to
+ * sha256_ctx_mgr_submit*(), but without the HASH_FIRST flag set.
+ *
+ * Ideally, on the last buffer for that hash, sha256_ctx_mgr_submit*() is called with
+ * HASH_LAST, although it is also possible to submit the hash with HASH_LAST and a zero
+ * length if necessary. When a SHA256_HASH_CTX is returned after having been submitted with
+ * HASH_LAST, it will contain a valid hash. The SHA256_HASH_CTX can be reused immediately
+ * by submitting with HASH_FIRST.
+ *
+ * For example, you would submit hashes with the following flags for the following numbers
+ * of buffers:
+ * <ul>
+ * <li> one buffer: HASH_FIRST | HASH_LAST (or, equivalently, HASH_ENTIRE)
+ * <li> two buffers: HASH_FIRST, HASH_LAST
+ * <li> three buffers: HASH_FIRST, HASH_UPDATE, HASH_LAST
+ * <li> etc.
+ * </ul>
+ *
+ * The order in which SHA256_CTX objects are returned is in general different from the order
+ * in which they are submitted.
+ *
+ * A few possible error conditions exist:
+ * <ul>
+ * <li> Submitting flags other than the allowed entire/first/update/last values
+ * <li> Submitting a context that is currently being managed by a SHA256_HASH_CTX_MGR.
+ * <li> Submitting a context after HASH_LAST is used but before HASH_FIRST is set.
+ * </ul>
+ *
+ * These error conditions are reported by returning the SHA256_HASH_CTX immediately after
+ * a submit with its error member set to a non-zero error code (defined in
+ * multi_buffer.h). No changes are made to the SHA256_HASH_CTX_MGR in the case of an
+ * error; no processing is done for other hashes.
+ *
+ */
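+
+/*
+ * Illustrative sketch (not part of the upstream header) of the
+ * First-Update..Last flow described above for a single stream supplied in
+ * three pieces; "buf1"/"buf2"/"buf3" and their lengths are assumptions for
+ * the example, and submit return values are ignored for brevity.
+ *
+ *   SHA256_HASH_CTX_MGR mgr;
+ *   SHA256_HASH_CTX c;
+ *
+ *   sha256_ctx_mgr_init(&mgr);
+ *   hash_ctx_init(&c);                                   // macro from multi_buffer.h
+ *   sha256_ctx_mgr_submit(&mgr, &c, buf1, len1, HASH_FIRST);
+ *   while (sha256_ctx_mgr_flush(&mgr) != NULL)
+ *           ;          // drain so that &c is no longer managed before re-submitting
+ *   sha256_ctx_mgr_submit(&mgr, &c, buf2, len2, HASH_UPDATE);
+ *   while (sha256_ctx_mgr_flush(&mgr) != NULL)
+ *           ;
+ *   sha256_ctx_mgr_submit(&mgr, &c, buf3, len3, HASH_LAST);
+ *   while (sha256_ctx_mgr_flush(&mgr) != NULL)
+ *           ;
+ *   // digest is now in c.job.result_digest; c.status and c.error report the outcome
+ */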
+
+#include <stdint.h>
+#include "multi_buffer.h"
+#include "types.h"
+
+#ifndef _MSC_VER
+#include <stdbool.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Hash Constants and Typedefs
+#define SHA256_DIGEST_NWORDS 8
+#define SHA256_MAX_LANES 16
+#define SHA256_X8_LANES 8
+#define SHA256_MIN_LANES 4
+#define SHA256_BLOCK_SIZE 64
+#define SHA256_LOG2_BLOCK_SIZE 6
+#define SHA256_PADLENGTHFIELD_SIZE 8
+#define SHA256_INITIAL_DIGEST \
+ 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, \
+ 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
+
+typedef uint32_t sha256_digest_array[SHA256_DIGEST_NWORDS][SHA256_MAX_LANES];
+typedef uint32_t SHA256_WORD_T;
+
+/** @brief Scheduler layer - Holds info describing a single SHA256 job for the multi-buffer manager */
+
+typedef struct {
+ uint8_t* buffer; //!< pointer to data buffer for this job
+ uint64_t len; //!< length of buffer for this job in blocks.
+ DECLARE_ALIGNED(uint32_t result_digest[SHA256_DIGEST_NWORDS], 64);
+ JOB_STS status; //!< output job status
+ void* user_data; //!< pointer for user's job-related data
+} SHA256_JOB;
+
+/** @brief Scheduler layer - Holds arguments for submitted SHA256 job */
+
+typedef struct {
+ sha256_digest_array digest;
+ uint8_t* data_ptr[SHA256_MAX_LANES];
+} SHA256_MB_ARGS_X16;
+
+/** @brief Scheduler layer - Lane data */
+
+typedef struct {
+ SHA256_JOB *job_in_lane;
+} SHA256_LANE_DATA;
+
+/** @brief Scheduler layer - Holds state for multi-buffer SHA256 jobs */
+
+typedef struct {
+ SHA256_MB_ARGS_X16 args;
+ uint32_t lens[SHA256_MAX_LANES];
+ uint64_t unused_lanes; //!< each nibble is index (0...3 or 0...7) of unused lanes, nibble 4 or 8 is set to F as a flag
+ SHA256_LANE_DATA ldata[SHA256_MAX_LANES];
+ uint32_t num_lanes_inuse;
+} SHA256_MB_JOB_MGR;
+
+/** @brief Context layer - Holds state for multi-buffer SHA256 jobs */
+
+typedef struct {
+ SHA256_MB_JOB_MGR mgr;
+} SHA256_HASH_CTX_MGR;
+
+/** @brief Context layer - Holds info describing a single SHA256 job for the multi-buffer CTX manager */
+
+typedef struct {
+ SHA256_JOB job; // Must be at struct offset 0.
+ HASH_CTX_STS status; //!< Context status flag
+ HASH_CTX_ERROR error; //!< Context error flag
+ uint64_t total_length; //!< Running counter of length processed for this CTX's job
+ const void* incoming_buffer; //!< pointer to data input buffer for this CTX's job
+ uint32_t incoming_buffer_length; //!< length of buffer for this job in bytes.
+ uint8_t partial_block_buffer[SHA256_BLOCK_SIZE * 2]; //!< CTX partial blocks
+ uint32_t partial_block_buffer_length;
+ void* user_data; //!< pointer for user to keep any job-related data
+} SHA256_HASH_CTX;
+
+/******************** multibinary function prototypes **********************/
+
+/**
+ * @brief Initialize the SHA256 multi-buffer manager structure.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha256_ctx_mgr_init (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA256 job to the multi-buffer manager.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_submit (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA256 jobs and return when complete.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_flush (SHA256_HASH_CTX_MGR* mgr);
+
+
+/*******************************************************************
+ * CTX level API function prototypes
+ ******************************************************************/
+
+/**
+ * @brief Initialize the context level SHA256 multi-buffer manager structure.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha256_ctx_mgr_init_sse (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA256 job to the context level multi-buffer manager.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_submit_sse (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA256 jobs and return when complete.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_flush_sse (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the context level SHA256 multi-buffer manager structure.
+ * @requires SSE4.1 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha256_ctx_mgr_init_sse_ni (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA256 job to the context level multi-buffer manager.
+ * @requires SSE4.1 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_submit_sse_ni (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA256 jobs and return when complete.
+ * @requires SSE4.1 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_flush_sse_ni (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA256 multi-buffer manager structure.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha256_ctx_mgr_init_avx (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA256 job to the multi-buffer manager.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_submit_avx (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA256 jobs and return when complete.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_flush_avx (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA256 multi-buffer manager structure.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha256_ctx_mgr_init_avx2 (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA256 job to the multi-buffer manager.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_submit_avx2 (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA256 jobs and return when complete.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_flush_avx2 (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA256 multi-buffer manager structure.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha256_ctx_mgr_init_avx512 (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA256 job to the multi-buffer manager.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_submit_avx512 (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA256 jobs and return when complete.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_flush_avx512 (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA256 multi-buffer manager structure.
+ * @requires AVX512 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha256_ctx_mgr_init_avx512_ni (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA256 job to the multi-buffer manager.
+ * @requires AVX512 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_submit_avx512_ni (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA256 jobs and return when complete.
+ * @requires AVX512 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_flush_avx512_ni (SHA256_HASH_CTX_MGR* mgr);
+
+
+/*******************************************************************
+ * Scheduler (internal) level out-of-order function prototypes
+ ******************************************************************/
+
+void sha256_mb_mgr_init_sse (SHA256_MB_JOB_MGR *state);
+SHA256_JOB* sha256_mb_mgr_submit_sse (SHA256_MB_JOB_MGR *state, SHA256_JOB* job);
+SHA256_JOB* sha256_mb_mgr_flush_sse (SHA256_MB_JOB_MGR *state);
+
+#define sha256_mb_mgr_init_avx sha256_mb_mgr_init_sse
+SHA256_JOB* sha256_mb_mgr_submit_avx (SHA256_MB_JOB_MGR *state, SHA256_JOB* job);
+SHA256_JOB* sha256_mb_mgr_flush_avx (SHA256_MB_JOB_MGR *state);
+
+void sha256_mb_mgr_init_avx2 (SHA256_MB_JOB_MGR *state);
+SHA256_JOB* sha256_mb_mgr_submit_avx2 (SHA256_MB_JOB_MGR *state, SHA256_JOB* job);
+SHA256_JOB* sha256_mb_mgr_flush_avx2 (SHA256_MB_JOB_MGR *state);
+
+void sha256_mb_mgr_init_avx512 (SHA256_MB_JOB_MGR *state);
+SHA256_JOB* sha256_mb_mgr_submit_avx512 (SHA256_MB_JOB_MGR *state, SHA256_JOB* job);
+SHA256_JOB* sha256_mb_mgr_flush_avx512 (SHA256_MB_JOB_MGR *state);
+
+void sha256_mb_mgr_init_sse_ni (SHA256_MB_JOB_MGR *state);
+SHA256_JOB* sha256_mb_mgr_submit_sse_ni (SHA256_MB_JOB_MGR *state, SHA256_JOB* job);
+SHA256_JOB* sha256_mb_mgr_flush_sse_ni (SHA256_MB_JOB_MGR *state);
+
+void sha256_mb_mgr_init_avx512_ni (SHA256_MB_JOB_MGR *state);
+SHA256_JOB* sha256_mb_mgr_submit_avx512_ni (SHA256_MB_JOB_MGR *state, SHA256_JOB* job);
+SHA256_JOB* sha256_mb_mgr_flush_avx512_ni (SHA256_MB_JOB_MGR *state);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _SHA256_MB_H_
diff --git a/src/crypto/isa-l/isa-l_crypto/include/sha512_mb.h b/src/crypto/isa-l/isa-l_crypto/include/sha512_mb.h
new file mode 100644
index 000000000..ce3950ad1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/sha512_mb.h
@@ -0,0 +1,422 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _SHA512_MB_H_
+#define _SHA512_MB_H_
+
+/**
+ * @file sha512_mb.h
+ * @brief Single/Multi-buffer CTX API SHA512 function prototypes and structures
+ *
+ * Interface for single and multi-buffer SHA512 functions
+ *
+ * <b> Single/Multi-buffer SHA512 Entire or First-Update..Update-Last </b>
+ *
+ * The interface to this single/multi-buffer hashing code is carried out through the
+ * context-level (CTX) init, submit and flush functions and the SHA512_HASH_CTX_MGR and
+ * SHA512_HASH_CTX objects. Numerous SHA512_HASH_CTX objects may be instantiated by the
+ * application for use with a single SHA512_HASH_CTX_MGR.
+ *
+ * The CTX interface functions carry out the initialization and padding of the jobs
+ * entered by the user and add them to the multi-buffer manager. The lower level "scheduler"
+ * layer then processes the jobs in an out-of-order manner. The scheduler layer functions
+ * are internal and are not intended to be invoked directly. Jobs can be submitted
+ * to a CTX as a complete buffer to be hashed, using the HASH_ENTIRE flag, or as partial
+ * jobs which can be started using the HASH_FIRST flag, and later resumed or finished
+ * using the HASH_UPDATE and HASH_LAST flags respectively.
+ *
+ * <b>Note:</b> The submit function does not require data buffers to be block sized.
+ *
+ * The SHA512 CTX interface functions are available for 5 architectures: multi-buffer SSE,
+ * AVX, AVX2, AVX512 and single-buffer SSE4 (which is used in the same way as the
+ * multi-buffer code). In addition, a multibinary interface is provided, which selects the
+ * appropriate architecture-specific function at runtime. This multibinary interface
+ * selects the single buffer SSE4 functions when the platform is detected to be Silvermont.
+ *
+ * <b>Usage:</b> The application creates a SHA512_HASH_CTX_MGR object and initializes it
+ * with a call to the sha512_ctx_mgr_init*() function, where henceforth "*" stands for the
+ * relevant suffix for each architecture: _sse, _avx, _avx2, _avx512 (or no suffix for the
+ * multibinary version). The SHA512_HASH_CTX_MGR object will be used to schedule processor
+ * resources, with up to 2 SHA512_HASH_CTX objects (or 4 in the AVX2 case, 8 in the AVX512
+ * case) being processed at a time.
+ *
+ * Each SHA512_HASH_CTX must be initialized before first use by the hash_ctx_init macro
+ * defined in multi_buffer.h. After initialization, the application may begin computing
+ * a hash by giving the SHA512_HASH_CTX to a SHA512_HASH_CTX_MGR using the submit functions
+ * sha512_ctx_mgr_submit*() with the HASH_FIRST flag set. When the SHA512_HASH_CTX is
+ * returned to the application (via this or a later call to sha512_ctx_mgr_submit*() or
+ * sha512_ctx_mgr_flush*()), the application can then re-submit it with another call to
+ * sha512_ctx_mgr_submit*(), but without the HASH_FIRST flag set.
+ *
+ * Ideally, on the last buffer for that hash, sha512_ctx_mgr_submit*() is called with
+ * HASH_LAST, although it is also possible to submit the hash with HASH_LAST and a zero
+ * length if necessary. When a SHA512_HASH_CTX is returned after having been submitted with
+ * HASH_LAST, it will contain a valid hash. The SHA512_HASH_CTX can be reused immediately
+ * by submitting with HASH_FIRST.
+ *
+ * For example, you would submit hashes with the following flags for the following numbers
+ * of buffers:
+ * <ul>
+ * <li> one buffer: HASH_FIRST | HASH_LAST (or, equivalently, HASH_ENTIRE)
+ * <li> two buffers: HASH_FIRST, HASH_LAST
+ * <li> three buffers: HASH_FIRST, HASH_UPDATE, HASH_LAST
+ * <li> etc.
+ * </ul>
+ *
+ * The order in which SHA512_CTX objects are returned is in general different from the order
+ * in which they are submitted.
+ *
+ * A few possible error conditions exist:
+ * <ul>
+ * <li> Submitting flags other than the allowed entire/first/update/last values
+ * <li> Submitting a context that is currently being managed by a SHA512_HASH_CTX_MGR. (Note:
+ * This error case is not applicable to the single buffer SSE4 version)
+ * <li> Submitting a context after HASH_LAST is used but before HASH_FIRST is set.
+ * </ul>
+ *
+ * These error conditions are reported by returning the SHA512_HASH_CTX immediately after
+ * a submit with its error member set to a non-zero error code (defined in
+ * multi_buffer.h). No changes are made to the SHA512_HASH_CTX_MGR in the case of an
+ * error; no processing is done for other hashes.
+ *
+ */
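+
+/*
+ * Illustrative sketch (not part of the upstream header) of the error reporting
+ * described above: a context returned from submit or flush with a non-zero
+ * error member was rejected, and the manager state is unchanged.  The names
+ * "buf" and "len" are assumptions for the example.
+ *
+ *   SHA512_HASH_CTX_MGR mgr;
+ *   SHA512_HASH_CTX c, *ret;
+ *
+ *   sha512_ctx_mgr_init(&mgr);
+ *   hash_ctx_init(&c);                                   // macro from multi_buffer.h
+ *   ret = sha512_ctx_mgr_submit(&mgr, &c, buf, len, HASH_ENTIRE);
+ *   if (ret != NULL && ret->error) {
+ *           // rejected submit (error codes defined in multi_buffer.h); the job
+ *           // may be corrected and submitted again
+ *   }
+ *   while ((ret = sha512_ctx_mgr_flush(&mgr)) != NULL) {
+ *           // ret->job.result_digest holds the completed SHA512 digest
+ *   }
+ */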
+
+#include <stdint.h>
+#include "multi_buffer.h"
+#include "types.h"
+
+#ifndef _MSC_VER
+#include <stdbool.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Hash Constants and Typedefs
+#define SHA512_DIGEST_NWORDS 8
+#define SHA512_MAX_LANES 8
+#define SHA512_X4_LANES 4
+#define SHA512_MIN_LANES 2
+#define SHA512_BLOCK_SIZE 128
+#define SHA512_LOG2_BLOCK_SIZE 7
+#define SHA512_PADLENGTHFIELD_SIZE 16
+#define SHA512_INITIAL_DIGEST \
+ 0x6a09e667f3bcc908,0xbb67ae8584caa73b,0x3c6ef372fe94f82b,0xa54ff53a5f1d36f1, \
+ 0x510e527fade682d1,0x9b05688c2b3e6c1f,0x1f83d9abfb41bd6b,0x5be0cd19137e2179
+
+
+typedef uint64_t sha512_digest_array[SHA512_DIGEST_NWORDS][SHA512_MAX_LANES];
+typedef uint64_t SHA512_WORD_T;
+
+/** @brief Scheduler layer - Holds info describing a single SHA512 job for the multi-buffer manager */
+
+typedef struct {
+ uint8_t* buffer; //!< pointer to data buffer for this job
+ uint64_t len; //!< length of buffer for this job in blocks.
+ DECLARE_ALIGNED(uint64_t result_digest[SHA512_DIGEST_NWORDS], 64);
+ JOB_STS status; //!< output job status
+ void* user_data; //!< pointer for user's job-related data
+} SHA512_JOB;
+
+/** @brief Scheduler layer - Holds arguments for submitted SHA512 job */
+
+typedef struct {
+ sha512_digest_array digest;
+ uint8_t* data_ptr[SHA512_MAX_LANES];
+} SHA512_MB_ARGS_X8;
+
+/** @brief Scheduler layer - Lane data */
+
+typedef struct {
+ SHA512_JOB *job_in_lane;
+} SHA512_LANE_DATA;
+
+/** @brief Scheduler layer - Holds state for multi-buffer SHA512 jobs */
+
+typedef struct {
+ SHA512_MB_ARGS_X8 args;
+ uint64_t lens[SHA512_MAX_LANES];
+ uint64_t unused_lanes; //!< each byte is index (00, 01 or 00...03) of unused lanes, byte 2 or 4 is set to FF as a flag
+ SHA512_LANE_DATA ldata[SHA512_MAX_LANES];
+ uint32_t num_lanes_inuse;
+} SHA512_MB_JOB_MGR;
+
+/** @brief Context layer - Holds state for multi-buffer SHA512 jobs */
+
+typedef struct {
+ SHA512_MB_JOB_MGR mgr;
+} SHA512_HASH_CTX_MGR;
+
+/** @brief Context layer - Holds info describing a single SHA512 job for the multi-buffer CTX manager */
+
+typedef struct {
+ SHA512_JOB job; // Must be at struct offset 0.
+ HASH_CTX_STS status; //!< Context status flag
+ HASH_CTX_ERROR error; //!< Context error flag
+ uint64_t total_length; //!< Running counter of length processed for this CTX's job
+ const void* incoming_buffer; //!< pointer to data input buffer for this CTX's job
+ uint32_t incoming_buffer_length; //!< length of buffer for this job in bytes.
+ uint8_t partial_block_buffer[SHA512_BLOCK_SIZE * 2]; //!< CTX partial blocks
+ uint32_t partial_block_buffer_length;
+ void* user_data; //!< pointer for user to keep any job-related data
+} SHA512_HASH_CTX;
+
+/*******************************************************************
+ * Context level API function prototypes
+ ******************************************************************/
+
+/**
+ * @brief Initialize the context level SHA512 multi-buffer manager structure.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha512_ctx_mgr_init_sse (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA512 job to the context level multi-buffer manager.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_submit_sse (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA512 jobs and return when complete.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_flush_sse (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA512 multi-buffer manager structure.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha512_ctx_mgr_init_avx (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA512 job to the multi-buffer manager.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_submit_avx (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA512 jobs and return when complete.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_flush_avx (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA512 multi-buffer manager structure.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha512_ctx_mgr_init_avx2 (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA512 job to the multi-buffer manager.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_submit_avx2 (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA512 jobs and return when complete.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_flush_avx2 (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA512 multi-buffer manager structure.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha512_ctx_mgr_init_avx512 (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA512 job to the multi-buffer manager.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_submit_avx512 (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA512 jobs and return when complete.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_flush_avx512 (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA512 multi-buffer manager structure.
+ * @requires SSE4
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha512_ctx_mgr_init_sb_sse4 (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA512 job to the multi-buffer manager.
+ * @requires SSE4
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_submit_sb_sse4 (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA512 jobs and return when complete.
+ * @requires SSE4
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_flush_sb_sse4 (SHA512_HASH_CTX_MGR* mgr);
+
+/******************** multibinary function prototypes **********************/
+
+/**
+ * @brief Initialize the SHA512 multi-buffer manager structure.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha512_ctx_mgr_init (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA512 job to the multi-buffer manager.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_submit (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA512 jobs and return when complete.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_flush (SHA512_HASH_CTX_MGR* mgr);
+
+/*******************************************************************
+ * Scheduler (internal) level out-of-order function prototypes
+ ******************************************************************/
+
+void sha512_mb_mgr_init_sse (SHA512_MB_JOB_MGR *state);
+SHA512_JOB* sha512_mb_mgr_submit_sse (SHA512_MB_JOB_MGR *state, SHA512_JOB* job);
+SHA512_JOB* sha512_mb_mgr_flush_sse (SHA512_MB_JOB_MGR *state);
+
+#define sha512_mb_mgr_init_avx sha512_mb_mgr_init_sse
+SHA512_JOB* sha512_mb_mgr_submit_avx (SHA512_MB_JOB_MGR *state, SHA512_JOB* job);
+SHA512_JOB* sha512_mb_mgr_flush_avx (SHA512_MB_JOB_MGR *state);
+
+void sha512_mb_mgr_init_avx2 (SHA512_MB_JOB_MGR *state);
+SHA512_JOB* sha512_mb_mgr_submit_avx2 (SHA512_MB_JOB_MGR *state, SHA512_JOB* job);
+SHA512_JOB* sha512_mb_mgr_flush_avx2 (SHA512_MB_JOB_MGR *state);
+
+void sha512_mb_mgr_init_avx512 (SHA512_MB_JOB_MGR *state);
+SHA512_JOB* sha512_mb_mgr_submit_avx512 (SHA512_MB_JOB_MGR *state, SHA512_JOB* job);
+SHA512_JOB* sha512_mb_mgr_flush_avx512 (SHA512_MB_JOB_MGR *state);
+
+// Single-buffer SHA512 APIs, optimized for Silvermont (SLM).
+void sha512_sse4 (const void* M, void* D, uint64_t L);
+// Note that these APIs follow the same high-level usage as the multi-buffer APIs.
+void sha512_sb_mgr_init_sse4 (SHA512_MB_JOB_MGR *state);
+SHA512_JOB* sha512_sb_mgr_submit_sse4 (SHA512_MB_JOB_MGR *state, SHA512_JOB* job);
+SHA512_JOB* sha512_sb_mgr_flush_sse4 (SHA512_MB_JOB_MGR *state);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _SHA512_MB_H_
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/include/sm3_mb.h b/src/crypto/isa-l/isa-l_crypto/include/sm3_mb.h
new file mode 100644
index 000000000..d9e7b4eed
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/sm3_mb.h
@@ -0,0 +1,155 @@
+/**********************************************************************
+ Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _SM3_MB_H_
+#define _SM3_MB_H_
+
+
+/**
+ * @file sm3_mb.h
+ * @brief Multi-buffer CTX API SM3 function prototypes and structures
+ *
+ * \warning Experimental interface with only base functions available at this
+ * time.
+ */
+
+#include <stdint.h>
+#include "multi_buffer.h"
+#include "types.h"
+
+#ifndef _MSC_VER
+#include <stdbool.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SM3_DIGEST_NWORDS 8 /* Word in SM3 is 32-bit */
+#define SM3_MAX_LANES 16
+#define SM3_X8_LANES 8
+#define SM3_BLOCK_SIZE 64
+#define SM3_LOG2_BLOCK_SIZE 6
+#define SM3_PADLENGTHFIELD_SIZE 8
+#define SM3_INITIAL_DIGEST \
+ 0x7380166f, 0x4914b2b9, 0x172442d7, 0xda8a0600, \
+ 0xa96f30bc, 0x163138aa, 0xe38dee4d, 0xb0fb0e4e
+
+typedef uint32_t sm3_digest_array[SM3_DIGEST_NWORDS][SM3_MAX_LANES];
+typedef uint32_t SM3_WORD_T;
+
+/** @brief Scheduler layer - Holds info describing a single SM3 job for the multi-buffer manager */
+
+typedef struct {
+ uint8_t *buffer; //!< pointer to data buffer for this job
+ uint64_t len; //!< length of buffer for this job in blocks.
+ DECLARE_ALIGNED(uint32_t result_digest[SM3_DIGEST_NWORDS], 64);
+ JOB_STS status; //!< output job status
+ void *user_data; //!< pointer for user's job-related data
+} SM3_JOB;
+
+/** @brief Scheduler layer - Holds arguments for submitted SM3 job */
+
+typedef struct {
+ sm3_digest_array digest;
+ uint8_t *data_ptr[SM3_MAX_LANES];
+} SM3_MB_ARGS_X16;
+
+/** @brief Scheduler layer - Lane data */
+
+typedef struct {
+ SM3_JOB *job_in_lane;
+} SM3_LANE_DATA;
+
+/** @brief Scheduler layer - Holds state for multi-buffer SM3 jobs */
+
+typedef struct {
+ SM3_MB_ARGS_X16 args;
+ uint32_t lens[SM3_MAX_LANES];
+	uint64_t unused_lanes; //!< each nibble is the index (0...3 or 0...7) of an unused lane; nibble 4 or 8 is set to F as a flag
+ SM3_LANE_DATA ldata[SM3_MAX_LANES];
+ uint32_t num_lanes_inuse;
+} SM3_MB_JOB_MGR;
+
+/** @brief Context layer - Holds state for multi-buffer SM3 jobs */
+
+typedef struct {
+ SM3_MB_JOB_MGR mgr;
+} SM3_HASH_CTX_MGR;
+
+/** @brief Context layer - Holds info describing a single SM3 job for the multi-buffer CTX manager */
+
+typedef struct {
+ SM3_JOB job; // Must be at struct offset 0.
+ HASH_CTX_STS status; //!< Context status flag
+ HASH_CTX_ERROR error; //!< Context error flag
+ uint64_t total_length; //!< Running counter of length processed for this CTX's job
+ const void *incoming_buffer; //!< pointer to data input buffer for this CTX's job
+ uint32_t incoming_buffer_length; //!< length of buffer for this job in bytes.
+ uint8_t partial_block_buffer[SM3_BLOCK_SIZE * 2]; //!< CTX partial blocks
+ uint32_t partial_block_buffer_length;
+ void *user_data; //!< pointer for user to keep any job-related data
+} SM3_HASH_CTX;
+
+/******************** multibinary function prototypes **********************/
+
+/**
+* @brief Initialize the SM3 multi-buffer manager structure.
+*
+* @param mgr Structure holding context level state info
+* @returns void
+*/
+void sm3_ctx_mgr_init(SM3_HASH_CTX_MGR * mgr);
+
+/**
+* @brief Submit a new SM3 job to the multi-buffer manager.
+*
+* @param mgr Structure holding context level state info
+* @param ctx Structure holding ctx job info
+* @param buffer Pointer to buffer to be processed
+* @param len Length of buffer (in bytes) to be processed
+* @param flags Input flag specifying job type (first, update, last or entire)
+* @returns NULL if no jobs complete, or a pointer to a completed job's context structure.
+*/
+SM3_HASH_CTX *sm3_ctx_mgr_submit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags);
+
+/**
+* @brief Finish all submitted SM3 jobs and return when complete.
+*
+* @param mgr Structure holding context level state info
+* @returns NULL if there are no jobs to complete, or a pointer to a completed job's context structure.
+*/
+SM3_HASH_CTX *sm3_ctx_mgr_flush(SM3_HASH_CTX_MGR * mgr);
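+
+/*
+ * Illustrative usage sketch (not part of the upstream header). The typical
+ * multi-buffer flow is: init the manager, submit one job per buffer, then
+ * flush until NULL. Assumes hash_ctx_init() from multi_buffer.h; buf/buf_len
+ * stand for the application's data, and error checks are omitted.
+ *
+ *   SM3_HASH_CTX_MGR mgr;
+ *   SM3_HASH_CTX ctx;
+ *
+ *   sm3_ctx_mgr_init(&mgr);
+ *   hash_ctx_init(&ctx);
+ *   sm3_ctx_mgr_submit(&mgr, &ctx, buf, buf_len, HASH_ENTIRE);
+ *   while (sm3_ctx_mgr_flush(&mgr) != NULL)
+ *           ;                       // drain all in-flight jobs
+ *   // digest now in ctx.job.result_digest[0 .. SM3_DIGEST_NWORDS-1]
+ */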
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/include/test.h b/src/crypto/isa-l/isa-l_crypto/include/test.h
new file mode 100644
index 000000000..7b99390b8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/test.h
@@ -0,0 +1,111 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _TEST_H
+#define _TEST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "endian_helper.h"
+
+// Use sys/time.h functions for time
+#if defined (__unix__) || (__APPLE__) || (__MINGW32__)
+# include <sys/time.h>
+#endif
+
+#ifdef _MSC_VER
+# define inline __inline
+# include <time.h>
+# include <Windows.h>
+#endif
+
+#include <stdio.h>
+#include <stdint.h>
+
+struct perf{
+ struct timeval tv;
+};
+
+
+#if defined (__unix__) || (__APPLE__) || (__MINGW32__)
+static inline int perf_start(struct perf *p)
+{
+ return gettimeofday(&(p->tv), 0);
+}
+static inline int perf_stop(struct perf *p)
+{
+ return gettimeofday(&(p->tv), 0);
+}
+
+static inline void perf_print(struct perf stop, struct perf start, long long dsize)
+{
+ long long secs = stop.tv.tv_sec - start.tv.tv_sec;
+ long long usecs = secs * 1000000 + stop.tv.tv_usec - start.tv.tv_usec;
+
+ printf("runtime = %10lld usecs", usecs);
+ if (dsize != 0) {
+#if 1 // single printf; the #else branch splits it to work around 32-bit printf issues
+ printf(", bandwidth %lld MB in %.4f sec = %.2f MB/s\n", dsize/(1024*1024),
+ ((double) usecs)/1000000, ((double) dsize) / (double)usecs);
+#else
+ printf(", bandwidth %lld MB ", dsize/(1024*1024));
+ printf("in %.4f sec ",(double)usecs/1000000);
+ printf("= %.2f MB/s\n", (double)dsize/usecs);
+#endif
+ }
+ else
+ printf("\n");
+}
+#endif
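+
+/*
+ * Illustrative timing sketch (not part of the upstream header); run_workload()
+ * is a hypothetical function and dsize is the number of bytes it processed:
+ *
+ *   struct perf start, stop;
+ *   perf_start(&start);
+ *   run_workload();
+ *   perf_stop(&stop);
+ *   perf_print(stop, start, dsize);
+ */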
+
+static inline uint64_t get_filesize(FILE *fp)
+{
+ uint64_t file_size;
+ fpos_t pos, pos_curr;
+
+ fgetpos(fp, &pos_curr); /* Save current position */
+#if defined(_WIN32) || defined(_WIN64)
+ _fseeki64(fp, 0, SEEK_END);
+#else
+ fseeko(fp, 0, SEEK_END);
+#endif
+ fgetpos(fp, &pos);
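+	/* Assumes fpos_t stores a plain byte offset in its first 8 bytes
+	   (true for common glibc and MSVC targets). */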
+ file_size = *(uint64_t *)&pos;
+ fsetpos(fp, &pos_curr); /* Restore position */
+
+ return file_size;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _TEST_H
diff --git a/src/crypto/isa-l/isa-l_crypto/include/types.h b/src/crypto/isa-l/isa-l_crypto/include/types.h
new file mode 100644
index 000000000..de452557a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/types.h
@@ -0,0 +1,100 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+/**
+ * @file types.h
+ * @brief Defines common alignment and debug macros
+ *
+ */
+
+#ifndef __TYPES_H
+#define __TYPES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#if defined __unix__ || defined __APPLE__
+# define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval)))
+# define __forceinline static inline
+# define aligned_free(x) free(x)
+#else
+# ifdef __MINGW32__
+# define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval)))
+# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn)))
+# define aligned_free(x) _aligned_free(x)
+# else
+# define DECLARE_ALIGNED(decl, alignval) __declspec(align(alignval)) decl
+# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn)))
+# define aligned_free(x) _aligned_free(x)
+# endif
+#endif
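+
+/*
+ * Illustrative use of the portability macros above (not part of the header):
+ *
+ *   DECLARE_ALIGNED(uint8_t tweak[16], 16);   // aligned on GCC/Clang and MSVC
+ *
+ *   void *p;
+ *   if (posix_memalign(&p, 64, 4096) == 0)    // _aligned_malloc() on Windows
+ *           aligned_free(p);                  // free() on unix, _aligned_free() on Windows
+ */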
+
+#ifdef DEBUG
+# define DEBUG_PRINT(x) printf x
+#else
+# define DEBUG_PRINT(x) do {} while (0)
+#endif
+
+
+#ifndef __has_feature
+# define __has_feature(x) 0
+#endif
+#ifndef __has_extension
+# define __has_extension __has_feature
+#endif
+#define ISAL_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+
+#if (defined(__ICC) || defined( __GNUC__ ) || defined(__clang__)) && !defined(ISAL_UNIT_TEST)
+# if __has_extension(attribute_deprecated_with_message) \
+ || (ISAL_GCC_VERSION >= 40500) \
+ || (__INTEL_COMPILER >= 1100)
+# define ISAL_DEPRECATED(message) __attribute__(( deprecated( message )))
+# else
+# define ISAL_DEPRECATED(message) __attribute__(( deprecated ))
+# endif
+#elif (defined( __ICL ) || defined(_MSC_VER))
+# if (__INTEL_COMPILER >= 1100) || (_MSC_FULL_VER >= 140050727)
+# define ISAL_DEPRECATED(message) __declspec( deprecated ( message ))
+# else
+# define ISAL_DEPRECATED(message) __declspec( deprecated )
+# endif
+#else
+# define ISAL_DEPRECATED(message)
+#endif
+
+#define ISAL_EXPERIMENTAL(message) ISAL_DEPRECATED("Experimental: " message)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //__TYPES_H
diff --git a/src/crypto/isa-l/isa-l_crypto/isa-l_crypto.def b/src/crypto/isa-l/isa-l_crypto/isa-l_crypto.def
new file mode 100644
index 000000000..de38b6d19
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/isa-l_crypto.def
@@ -0,0 +1,80 @@
+LIBRARY isa-l_crypto
+VERSION 2.24
+EXPORTS
+
+sha1_ctx_mgr_init @1
+sha1_ctx_mgr_submit @2
+sha1_ctx_mgr_flush @3
+sha256_ctx_mgr_init @4
+sha256_ctx_mgr_submit @5
+sha256_ctx_mgr_flush @6
+sha512_ctx_mgr_init @7
+sha512_ctx_mgr_submit @8
+sha512_ctx_mgr_flush @9
+md5_ctx_mgr_init @10
+md5_ctx_mgr_submit @11
+md5_ctx_mgr_flush @12
+mh_sha1_init @13
+mh_sha1_update @14
+mh_sha1_finalize @15
+mh_sha1_finalize_base @16
+mh_sha1_update_base @17
+mh_sha1_murmur3_x64_128_init @18
+mh_sha1_murmur3_x64_128_finalize_base @19
+mh_sha1_murmur3_x64_128_update_base @20
+mh_sha1_murmur3_x64_128_update @21
+mh_sha1_murmur3_x64_128_finalize @22
+aes_keyexp_128 @23
+aes_keyexp_192 @24
+aes_keyexp_256 @25
+aes_cbc_enc_128 @26
+aes_cbc_dec_128 @27
+aes_cbc_enc_192 @28
+aes_cbc_dec_192 @29
+aes_cbc_enc_256 @30
+aes_cbc_dec_256 @31
+aes_cbc_precomp @32
+XTS_AES_128_enc @33
+XTS_AES_128_enc_expanded_key @34
+XTS_AES_128_dec @35
+XTS_AES_128_dec_expanded_key @36
+XTS_AES_256_enc @37
+XTS_AES_256_enc_expanded_key @38
+XTS_AES_256_dec @39
+XTS_AES_256_dec_expanded_key @40
+mh_sha256_init @41
+mh_sha256_update @42
+mh_sha256_finalize @43
+mh_sha256_finalize_base @44
+mh_sha256_update_base @45
+rolling_hashx_mask_gen @46
+rolling_hash2_run @47
+rolling_hash2_reset @48
+rolling_hash2_init @49
+aes_gcm_pre_128 @50
+aes_gcm_enc_128 @51
+aes_gcm_dec_128 @52
+aes_gcm_init_128 @53
+aes_gcm_enc_128_update @54
+aes_gcm_dec_128_update @55
+aes_gcm_enc_128_finalize @56
+aes_gcm_dec_128_finalize @57
+aes_gcm_pre_256 @58
+aes_gcm_enc_256 @59
+aes_gcm_dec_256 @60
+aes_gcm_init_256 @61
+aes_gcm_enc_256_update @62
+aes_gcm_dec_256_update @63
+aes_gcm_enc_256_finalize @64
+aes_gcm_dec_256_finalize @65
+aes_gcm_enc_128_nt @66
+aes_gcm_dec_128_nt @67
+aes_gcm_enc_128_update_nt @68
+aes_gcm_dec_128_update_nt @69
+aes_gcm_enc_256_nt @70
+aes_gcm_dec_256_nt @71
+aes_gcm_enc_256_update_nt @72
+aes_gcm_dec_256_update_nt @73
+sm3_ctx_mgr_init @74
+sm3_ctx_mgr_submit @75
+sm3_ctx_mgr_flush @76
diff --git a/src/crypto/isa-l/isa-l_crypto/libisal_crypto.pc.in b/src/crypto/isa-l/isa-l_crypto/libisal_crypto.pc.in
new file mode 100644
index 000000000..41ba8d5a3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/libisal_crypto.pc.in
@@ -0,0 +1,11 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: libisal_crypto
+Description: Crypto library for storage systems
+Version: @VERSION@
+Libs: -L${libdir} -lisal_crypto
+Libs.private:
+Cflags: -I${includedir}
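+# Illustrative consumer usage: cc app.c $(pkg-config --cflags --libs libisal_crypto)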
diff --git a/src/crypto/isa-l/isa-l_crypto/make.inc b/src/crypto/isa-l/isa-l_crypto/make.inc
new file mode 100644
index 000000000..0cb94d12d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/make.inc
@@ -0,0 +1,340 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+
+# Makefile include for optimized libraries
+# make targets:
+# lib - build library of optimized functions
+# slib - build shared library
+# test - run unit tests of functions
+# perf - run performance tests
+# install - install headers and libs to system location
+# sim - run on simulator
+# trace - get simulator trace
+# clean - remove object files
+
+version ?= 2.24.0
+
+
+
+CC = gcc
+AS = nasm
+AWK = awk
+
+DEBUG = -g
+DEBUG_yasm = -g dwarf2
+DEBUG_nasm = -g
+
+# Default arch= build options
+CFLAGS_ = -Wall
+ASFLAGS_ = -f elf64
+ARFLAGS_ = cr $@
+STRIP_gcc = strip -d -R .comment $@
+
+# arch=32 build options
+ASFLAGS_32 = -f elf32
+CFLAGS_32 = -m32
+ARFLAGS_32 = cr $@
+
+# arch=win64 build options
+ASFLAGS_win64 = -f win64
+CFLAGS_icl = -Qstd=c99
+ARFLAGS_win64 = -out:$@
+
+# arch=mingw build options
+ASFLAGS_mingw = -f win64
+ARFLAGS_mingw = cr $@
+
+LDFLAGS_so = -Wl,-soname,$(soname)
+
+ifeq ($(arch),mingw)
+ CC=x86_64-w64-mingw32-gcc
+ AR=x86_64-w64-mingw32-ar
+ LDFLAGS += -Wl,--force-exe-suffix
+ SIM=wine
+ EXT=.exe
+ CLEANFILES+=*.exe
+endif
+
+ASFLAGS_Darwin = -f macho64 --prefix=_
+ARFLAGS_Darwin = -r $@
+ifeq ($(shell uname),Darwin)
+ LDFLAGS_so =
+ STRIP_gcc =
+endif
+
+# arch=aarch64 build options
+ifeq ($(lib_debug),1)
+ ASFLAGS_aarch64 = -g -c
+else
+ ASFLAGS_aarch64 = -c
+endif
+
+ARFLAGS_aarch64 = cr $@
+ifeq ($(arch),aarch64)
+ AS=$(CC) -D__ASSEMBLY__
+ SIM=
+endif
+# arch=noarch build options
+ARFLAGS_noarch = cr $@
+CFLAGS_noarch= -DNOARCH
+ifeq ($(arch),noarch)
+ host_cpu=base_aliases
+endif
+INCLUDE = $(patsubst %,-I%/,$(subst :, ,$(VPATH)))
+CFLAGS = $(CFLAGS_$(arch)) $(CFLAGS_$(CC)) $(DEBUG) -O2 $(DEFINES) $(INCLUDE)
+ASFLAGS = $(ASFLAGS_$(arch)) $(ASFLAGS_$(CC)) $(DEBUG_$(AS)) $(DEFINES) $(INCLUDE)
+ARFLAGS = $(ARFLAGS_$(arch))
+DEFINES += $(addprefix -D , $D)
+CLEANFILES += $(O) *.o *.a $(all_tests) $(lib_name) $(so_lib_name)
+
+ifeq ($(filter aarch64 x86_%,$(host_cpu)),)
+ host_cpu=base_aliases
+endif
+lsrc += $(lsrc_$(host_cpu))
+O = bin
+lobj += $(patsubst %.c,%.o,$(patsubst %.S,%.o,$(patsubst %.asm,%.o,$(lsrc) $(lsrc_intrinsic))))
+objs = $(addprefix $(O)/,$(notdir $(lobj)))
+
+
+lib_name ?= isa-l_crypto.a
+default: lib slib
+
+# Defaults for windows build
+ifeq ($(arch),win64)
+ AR=lib
+ CC=cl
+ OUTPUT_OPTION = -Fo$@
+ DEBUG=
+ lib_name := $(basename $(lib_name)).lib
+endif
+lsrcwin64 = $(lsrc)
+unit_testswin64 = $(unit_tests)
+exampleswin64 = $(examples)
+perf_testswin64 = $(perf_tests)
+
+
+# Build and run unit tests, performance tests, etc.
+all_tests = $(notdir $(sort $(perf_tests) $(check_tests) $(unit_tests) $(examples) $(other_tests)))
+all_unit_tests = $(notdir $(sort $(check_tests) $(unit_tests)))
+all_perf_tests = $(notdir $(sort $(perf_tests)))
+all_check_tests = $(notdir $(sort $(check_tests)))
+
+$(all_unit_tests): % : %.c $(lib_name)
+$(all_perf_tests): % : %.c $(lib_name)
+$(sort $(notdir $(examples))): % : %.c $(lib_name)
+$(sort $(notdir $(other_tests))): % : %.c $(lib_name)
+
+# Check for a modern assembler
+test-as = $(shell hash printf && printf $(3) > $(2) && $(AS) $(ASFLAGS) ${tmpf} -o /dev/null 2> /dev/null && echo $(1) || echo $(4))
+as_4 := "pblendvb xmm2, xmm1;"
+as_6 := "vinserti32x8 zmm0, ymm1, 1;"
+as_10 := "vpcompressb zmm0 {k1}, zmm1;"
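+# Feature levels probed above: 4 = SSE4.1 (pblendvb), 6 = AVX-512 (vinserti32x8),
+# 10 = AVX-512 VBMI2 (vpcompressb).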
+
+tmpf := $(shell mktemp)
+as_feature_level := $(call test-as, 4, $(tmpf), $(as_4), $(as_feature_level))
+as_feature_level := $(call test-as, 6, $(tmpf), $(as_6), $(as_feature_level))
+as_feature_level := $(call test-as, 10, $(tmpf), $(as_10), $(as_feature_level))
+tmpf := $(shell rm ${tmpf})
+
+ifneq ($(findstring $(as_feature_level),6 10),)
+ D_HAVE_AS_KNOWS_AVX512_y := -DHAVE_AS_KNOWS_AVX512
+endif
+
+CFLAGS += -DAS_FEATURE_LEVEL=$(as_feature_level) $(D_HAVE_AS_KNOWS_AVX512_y)
+ASFLAGS += -DAS_FEATURE_LEVEL=$(as_feature_level) $(D_HAVE_AS_KNOWS_AVX512_y)
+
+sim test trace: $(addsuffix .run,$(all_unit_tests))
+perf: $(addsuffix .run,$(all_perf_tests))
+check: $(addsuffix .run,$(all_check_tests))
+ex: $(notdir $(examples))
+all: lib $(all_tests)
+other: $(notdir $(other_tests))
+tests: $(all_unit_tests)
+perfs: $(all_perf_tests)
+checks: $(all_check_tests)
+trace: SIM=sde -debugtrace --
+sim: SIM=sde --
+check test sim:
+ @echo Finished running $@
+
+$(objs): | $(O)
+$(O): ; mkdir -p $(O)
+
+# Build rule to run tests
+$(addsuffix .run,$(all_tests)): %.run : %
+ $(SIM) ./$<$(EXT)
+ @echo Completed run: $<
+
+# Other build rules
+msg = $(if $(DEBUG),DEBUG) $(patsubst 32,32-bit,$(host_cpu)) $D
+
+# gcc assembly files
+$(O)/%.o: $(host_cpu)/%.S
+ @echo " ---> Building $< $(msg)"
+ @$(AS) $(ASFLAGS) -o $@ $<
+
+$(O)/%.o : $(host_cpu)/%.c
+ @echo " ---> Building $< $(msg)"
+ @$(COMPILE.c) $(OUTPUT_OPTION) $<
+# yasm/nasm assembly files
+$(O)/%.o: %.asm
+ @echo " ---> Building $< $(msg)"
+ @$(AS) $(ASFLAGS) -o $@ $<
+
+
+$(O)/%.o %.o: %.c
+ @echo " ---> Building $< $(msg)"
+ @$(COMPILE.c) $(OUTPUT_OPTION) $<
+
+$(all_tests):
+ @echo " ---> Building Test $@ $(msg)"
+ @$(LINK.o) $(CFLAGS) $^ $(LDLIBS) -o $@
+
+
+# Target to build lib files
+lib: $(lib_name)
+ifneq ($(lib_debug),1)
+ $(lib_name): DEBUG_$(AS)= # Don't put debug symbols in the lib
+ $(lib_name): DEBUG=
+ $(lib_name): DEFINES+=-D NDEBUG
+endif
+ifeq ($(lib_debug),1)
+ DEBUG+=-D DEBUG # Define DEBUG for macros
+endif
+
+#lib $(lib_name): $(lib_name)(${objs})
+$(lib_name): $(objs)
+ @echo " ---> Creating Lib $@"
+ @$(AR) $(ARFLAGS) $^
+ifneq ($(lib_debug),1)
+ @$(STRIP_$(CC))
+endif
+
+
+# Target for shared lib
+so_lib_name = bin/libisal_crypto.so
+so_lib_inst = $(notdir $(so_lib_name))
+so_lib_ver = $(so_lib_inst).$(version)
+soname = $(so_lib_inst).$(word 1, $(subst ., ,$(version)))
+
+slib: $(so_lib_name)
+aobjs += $(addprefix $(O)/,$(patsubst %.asm,%.o,$(filter %.asm,$(notdir $(lsrc) $(lsrc_intrinsic)))))
+aobjs += $(addprefix $(O)/,$(patsubst %.S,%.o,$(filter %.S,$(notdir $(lsrc) $(lsrc_intrinsic)))))
+shared_objs += $(addprefix $(O)/shared_ver_,$(patsubst %.c,%.o,$(filter %.c,$(notdir $(lsrc) $(lsrc_intrinsic)))))
+
+$(O)/shared_ver_%.o: %.c
+ @echo " ---> Building shared $< $(msg)"
+ @$(COMPILE.c) $(OUTPUT_OPTION) $<
+
+$(O)/shared_ver_%.o: $(host_cpu)/%.c
+ @echo " ---> Building shared $< $(msg)"
+ @$(COMPILE.c) $(OUTPUT_OPTION) $<
+ifneq ($(lib_debug),1)
+ $(so_lib_name): DEBUG_$(AS)=
+ $(so_lib_name): DEBUG=
+ $(so_lib_name): DEFINES+=-D NDEBUG
+endif
+
+$(shared_objs): CFLAGS += -fPIC
+$(shared_objs) $(aobjs): | $(O)
+$(so_lib_name): LDFLAGS+=$(LDFLAGS_so)
+$(so_lib_name): $(shared_objs) $(aobjs)
+ @echo " ---> Creating Shared Lib $@"
+ @$(CC) $(CFLAGS) --shared $(LDFLAGS) -o $@ $^
+ @(cd $(@D); ln -f -s $(so_lib_inst) $(soname))
+
+isa-l_crypto.h:
+ @echo 'Building $@'
+ @echo '' >> $@
+ @echo '/**' >> $@
+ @echo ' * @file isa-l_crypto.h'>> $@
+ @echo ' * @brief Include for ISA-L_crypto library' >> $@
+ @echo ' */' >> $@
+ @echo '' >> $@
+ @echo '#ifndef _ISAL_CRYPTO_H_' >> $@
+ @echo '#define _ISAL_CRYPTO_H_' >> $@
+ @echo '' >> $@
+ @echo '#define.ISAL_CRYPTO_MAJOR_VERSION.${version}' | ${AWK} -F . '{print $$1, $$2, $$3}' >> $@
+ @echo '#define.ISAL_CRYPTO_MINOR_VERSION.${version}' | ${AWK} -F . '{print $$1, $$2, $$4}' >> $@
+ @echo '#define.ISAL_CRYPTO_PATCH_VERSION.${version}' | ${AWK} -F . '{print $$1, $$2, $$5}' >> $@
+ @echo '#define ISAL_CRYPTO_MAKE_VERSION(maj, min, patch) ((maj) * 0x10000 + (min) * 0x100 + (patch))' >> $@
+ @echo '#define ISAL_CRYPTO_VERSION ISAL_CRYPTO_MAKE_VERSION(ISAL_CRYPTO_MAJOR_VERSION, ISAL_CRYPTO_MINOR_VERSION, ISAL_CRYPTO_PATCH_VERSION)' >> $@
+ @echo '' >> $@
+ @for unit in $(sort $(extern_hdrs)); do echo "#include <isa-l_crypto/$$unit>" | sed -e 's;include/;;' >> $@; done
+ @echo '#endif //_ISAL_CRYPTO_H_' >> $@
+
+
+# Target for install
+prefix = /usr/local
+install_dirs = $(prefix)/lib $(prefix)/include/isa-l_crypto
+$(install_dirs): ; mkdir -p $@
+install: $(sort $(extern_hdrs)) | $(install_dirs) $(lib_name) $(so_lib_name) isa-l_crypto.h
+ install -m 644 $(lib_name) $(prefix)/lib/libisal_crypto.a
+ install -m 644 $^ $(prefix)/include/isa-l_crypto/.
+ install -m 664 isa-l_crypto.h $(prefix)/include/.
+ install -m 664 include/types.h $(prefix)/include/isa-l_crypto/.
+ install -m 664 include/endian_helper.h $(prefix)/include/isa-l_crypto/.
+ install -m 664 $(so_lib_name) $(prefix)/lib/$(so_lib_ver)
+ (cd $(prefix)/lib && ln -f -s $(so_lib_ver) $(soname) && ln -f -s $(so_lib_ver) $(so_lib_inst))
+ifeq ($(shell uname),Darwin)
+ (cd $(prefix)/lib && ln -f -s $(so_lib_ver) $(basename $(so_lib_inst)).dylib)
+ which glibtool && glibtool --mode=finish $(prefix)/lib
+else
+ which libtool && libtool --mode=finish $(prefix)/lib || \
+ echo 'Lib installed at $(prefix)/lib. Run system-dependent programs to add shared lib path.'
+endif
+
+uninstall:
+ $(RM) $(prefix)/lib/libisal_crypto.a
+ $(RM) $(prefix)/lib/$(soname)
+ $(RM) $(prefix)/lib/$(so_lib_ver)
+ $(RM) $(prefix)/lib/$(so_lib_inst)
+ $(RM) -r $(prefix)/include/isa-l_crypto
+ $(RM) $(prefix)/include/isa-l_crypto.h
+ $(RM) $(prefix)/lib/$(basename $(so_lib_inst)).dylib
+
+# Collect performance data
+rpt_name = perf_report_$(shell uname -n)_$(shell date +%y%m%d).perf
+
+perf_report:
+ echo Results for $(rpt_name) >> $(rpt_name)
+ $(MAKE) -f Makefile.unx -k perf | tee -a $(rpt_name)
+ @echo Summary:
+ -grep runtime $(rpt_name)
+
+
+clean:
+ @echo Cleaning up
+ @$(RM) -r $(CLEANFILES)
+
+doc: isa-l_crypto.h
+ (cat Doxyfile; echo 'PROJECT_NUMBER=$(version)') | doxygen -
+ $(MAKE) -C generated_doc/latex &> generated_doc/latex_build_api.log
+ cp generated_doc/latex/refman.pdf isa-l_crypto_api_$(version).pdf
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am
new file mode 100644
index 000000000..423f12945
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am
@@ -0,0 +1,98 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_x86_64 += md5_mb/md5_ctx_sse.c \
+ md5_mb/md5_ctx_avx.c \
+ md5_mb/md5_ctx_avx2.c \
+ md5_mb/md5_ctx_base.c
+
+lsrc_x86_64 += md5_mb/md5_mb_mgr_init_sse.c \
+ md5_mb/md5_mb_mgr_init_avx2.c \
+ md5_mb/md5_mb_mgr_init_avx512.c
+
+lsrc_x86_64 += md5_mb/md5_mb_mgr_submit_sse.asm \
+ md5_mb/md5_mb_mgr_submit_avx.asm \
+ md5_mb/md5_mb_mgr_submit_avx2.asm \
+ md5_mb/md5_mb_mgr_flush_sse.asm \
+ md5_mb/md5_mb_mgr_flush_avx.asm \
+ md5_mb/md5_mb_mgr_flush_avx2.asm \
+ md5_mb/md5_mb_x4x2_sse.asm \
+ md5_mb/md5_mb_x4x2_avx.asm \
+ md5_mb/md5_mb_x8x2_avx2.asm \
+ md5_mb/md5_multibinary.asm
+
+lsrc_x86_64 += md5_mb/md5_mb_mgr_submit_avx512.asm \
+ md5_mb/md5_mb_mgr_flush_avx512.asm \
+ md5_mb/md5_mb_x16x2_avx512.asm \
+ md5_mb/md5_ctx_avx512.c
+
+lsrc_x86_32 += $(lsrc_x86_64)
+
+lsrc_aarch64 += md5_mb/md5_ctx_base.c \
+ md5_mb/aarch64/md5_ctx_aarch64_asimd.c \
+ md5_mb/aarch64/md5_mb_aarch64_dispatcher.c \
+ md5_mb/aarch64/md5_mb_mgr_aarch64_asimd.c \
+ md5_mb/aarch64/md5_mb_asimd_x4.S \
+ md5_mb/aarch64/md5_mb_asimd_x1.S \
+ md5_mb/aarch64/md5_mb_multibinary.S
+
+
+lsrc_base_aliases += md5_mb/md5_ctx_base.c \
+ md5_mb/md5_ctx_base_aliases.c
+src_include += -I $(srcdir)/md5_mb
+extern_hdrs += include/md5_mb.h \
+ include/multi_buffer.h
+
+other_src += include/datastruct.asm \
+ md5_mb/md5_job.asm \
+ md5_mb/md5_mb_mgr_datastruct.asm \
+ md5_mb/md5_ref.c \
+ include/reg_sizes.asm \
+ include/multibinary.asm \
+ include/memcpy_inline.h \
+ include/intrinreg.h
+
+check_tests += md5_mb/md5_mb_test \
+ md5_mb/md5_mb_rand_test \
+ md5_mb/md5_mb_rand_update_test
+
+unit_tests += md5_mb/md5_mb_rand_ssl_test
+
+perf_tests += md5_mb/md5_mb_vs_ossl_perf
+
+
+md5_mb_rand_test: md5_ref.o
+md5_mb_md5_mb_rand_test_LDADD = md5_mb/md5_ref.lo libisal_crypto.la
+md5_mb_rand_update_test: md5_ref.o
+md5_mb_md5_mb_rand_update_test_LDADD = md5_mb/md5_ref.lo libisal_crypto.la
+md5_mb_rand_ssl_test: LDLIBS += -lcrypto
+md5_mb_md5_mb_rand_ssl_test_LDFLAGS = -lcrypto
+md5_mb_vs_ossl_perf: LDLIBS += -lcrypto
+md5_mb_md5_mb_vs_ossl_perf_LDFLAGS = -lcrypto
+
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_ctx_aarch64_asimd.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_ctx_aarch64_asimd.c
new file mode 100644
index 000000000..e9a708c17
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_ctx_aarch64_asimd.c
@@ -0,0 +1,230 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdlib.h>
+#include <assert.h>
+#include "md5_mb.h"
+#include "memcpy_inline.h"
+void md5_mb_mgr_init_asimd(MD5_MB_JOB_MGR * state);
+MD5_JOB *md5_mb_mgr_submit_asimd(MD5_MB_JOB_MGR * state, MD5_JOB * job);
+MD5_JOB *md5_mb_mgr_flush_asimd(MD5_MB_JOB_MGR * state);
+
+static inline void hash_init_digest(MD5_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len);
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx);
+
+void md5_ctx_mgr_init_asimd(MD5_HASH_CTX_MGR * mgr)
+{
+ md5_mb_mgr_init_asimd(&mgr->mgr);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_submit_asimd(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return md5_ctx_mgr_resubmit(mgr, ctx);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_flush_asimd(MD5_HASH_CTX_MGR * mgr)
+{
+ MD5_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_asimd(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = md5_ctx_mgr_resubmit(mgr, ctx);
+
+ // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len);
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % MD5_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= MD5_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_asimd(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(MD5_WORD_T * digest)
+{
+ static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] =
+ { MD5_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (MD5_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
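+	/* Advance i past the 0x80 byte, the zero fill and the 8-byte length field,
+	   so it becomes the total padded tail length (one or two whole blocks). */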
+ i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ MD5_PADLENGTHFIELD_SIZE;
+
+ *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3);
+
+ return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_aarch64_dispatcher.c
new file mode 100644
index 000000000..14ef3a6e6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_aarch64_dispatcher.c
@@ -0,0 +1,59 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+DEFINE_INTERFACE_DISPATCHER(md5_ctx_mgr_submit)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(md5_ctx_mgr_submit_asimd);
+
+ return PROVIDER_BASIC(md5_ctx_mgr_submit);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(md5_ctx_mgr_init)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(md5_ctx_mgr_init_asimd);
+
+ return PROVIDER_BASIC(md5_ctx_mgr_init);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(md5_ctx_mgr_flush)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(md5_ctx_mgr_flush_asimd);
+
+ return PROVIDER_BASIC(md5_ctx_mgr_flush);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x1.S b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x1.S
new file mode 100644
index 000000000..27d112494
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x1.S
@@ -0,0 +1,248 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ q_\name .req q\reg
+ v_\name .req v\reg
+ s_\name .req s\reg
+.endm
+
+
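+/* Rounds 0-15 use F(b,c,d) = (b AND c) OR ((NOT b) AND d),
+   computed below as ((c XOR d) AND b) XOR d. */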
+.macro round_0_15 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req
+ eor tmp0,\d_c,\d_d
+ mov k,\kl
+ and tmp0,tmp0,\d_b
+ movk k,\kh,lsl 16
+ eor tmp0,tmp0,\d_d
+ add tmp1,k,\w
+ add tmp0,tmp1,tmp0
+ add tmp0,\d_a,tmp0
+ ror tmp0,tmp0,32 - \r
+ add \d_a,\d_b,tmp0
+.endm
+
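+/* Rounds 16-31 use G(b,c,d) = (b AND d) OR (c AND (NOT d)),
+   computed below as ((b XOR c) AND d) XOR c. */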
+.macro round_16_31 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req
+ eor tmp0,\d_b,\d_c
+ mov k,\kl
+ and tmp0,tmp0,\d_d
+ movk k,\kh,lsl 16
+ eor tmp0,tmp0,\d_c
+ add tmp1,k,\w
+ add tmp0,tmp1,tmp0
+ add tmp0,\d_a,tmp0
+ ror tmp0,tmp0,32 - \r
+ add \d_a,\d_b,tmp0
+.endm
+
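+/* Rounds 32-47 use H(b,c,d) = b XOR c XOR d. */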
+.macro round_32_47 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req
+ eor tmp0,\d_b,\d_c
+ mov k,\kl
+ eor tmp0,tmp0,\d_d
+ movk k,\kh,lsl 16
+ add tmp1,k,\w
+ add tmp0,tmp1,tmp0
+ add tmp0,\d_a,tmp0
+ ror tmp0,tmp0,32 - \r
+ add \d_a,\d_b,tmp0
+.endm
+
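+/* Rounds 48-63 use I(b,c,d) = c XOR (b OR (NOT d)). */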
+.macro round_48_63 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req
+ orn tmp0,\d_b,\d_d
+ mov k,\kl
+ eor tmp0,tmp0,\d_c
+ movk k,\kh,lsl 16
+ add tmp1,k,\w
+ add tmp0,tmp1,tmp0
+ add tmp0,\d_a,tmp0
+ ror tmp0,tmp0,32 - \r
+ add \d_a,\d_b,tmp0
+.endm
+/*
+ variables
+*/
+ job0 .req x0
+ digest_addr .req x0
+ len .req w1
+ end .req x1
+
+ buf_adr .req x2
+ d_a .req w3
+ d_b .req w4
+ d_c .req w5
+ d_d .req w6
+ k .req w7
+ m0 .req w8
+ m1 .req w9
+ m2 .req w10
+ m3 .req w11
+ m4 .req w12
+ m5 .req w13
+ m6 .req w14
+ m7 .req w15
+ m8 .req w19
+ m9 .req w20
+ m10 .req w21
+ m11 .req w22
+ m12 .req w23
+ m13 .req w24
+ m14 .req w25
+ m15 .req w26
+
+ tmp0 .req w27
+ tmp1 .req w28
+
+ d_a1 .req w8
+ d_b1 .req w9
+ d_c1 .req w15
+ d_d1 .req w19
+
+/*
+ void md5_mb_asimd_x1(MD5_JOB * job0,int len)
+*/
+ .global md5_mb_asimd_x1
+ .type md5_mb_asimd_x1, %function
+md5_mb_asimd_x1:
+ cmp len,0
+ stp x29, x30, [sp,-96]!
+ ldr buf_adr,[job0],64
+ stp x19, x20, [sp, 16]
+ add end,buf_adr,end,lsl 6
+ stp x21, x22, [sp, 32]
+ ldp d_a,d_b,[digest_addr]
+ stp x23, x24, [sp, 48]
+ ldp d_c,d_d,[digest_addr,8]
+ stp x25, x26, [sp, 64]
+ stp x27, x28, [sp, 80]
+ ble .exit
+
+.loop_start:
+ ldp m0,m1,[buf_adr],8
+ ldp m2,m3,[buf_adr],8
+ round_0_15 d_a,d_b,d_c,d_d,0xd76a,0xa478,m0,7
+
+ ldp m4,m5,[buf_adr],8
+ round_0_15 d_d,d_a,d_b,d_c,0xe8c7,0xb756,m1,12
+ ldp m6,m7,[buf_adr],8
+ round_0_15 d_c,d_d,d_a,d_b,0x2420,0x70db,m2,17
+ ldp m8,m9,[buf_adr],8
+ round_0_15 d_b,d_c,d_d,d_a,0xc1bd,0xceee,m3,22
+ ldp m10,m11,[buf_adr],8
+ round_0_15 d_a,d_b,d_c,d_d,0xf57c,0xfaf,m4,7
+ ldp m12,m13,[buf_adr],8
+ round_0_15 d_d,d_a,d_b,d_c,0x4787,0xc62a,m5,12
+ ldp m14,m15,[buf_adr],8
+ round_0_15 d_c,d_d,d_a,d_b,0xa830,0x4613,m6,17
+ round_0_15 d_b,d_c,d_d,d_a,0xfd46,0x9501,m7,22
+ round_0_15 d_a,d_b,d_c,d_d,0x6980,0x98d8,m8,7
+ round_0_15 d_d,d_a,d_b,d_c,0x8b44,0xf7af,m9,12
+ round_0_15 d_c,d_d,d_a,d_b,0xffff,0x5bb1,m10,17
+ round_0_15 d_b,d_c,d_d,d_a,0x895c,0xd7be,m11,22
+ round_0_15 d_a,d_b,d_c,d_d,0x6b90,0x1122,m12,7
+ round_0_15 d_d,d_a,d_b,d_c,0xfd98,0x7193,m13,12
+ round_0_15 d_c,d_d,d_a,d_b,0xa679,0x438e,m14,17
+ round_0_15 d_b,d_c,d_d,d_a,0x49b4,0x821,m15,22
+
+ round_16_31 d_a,d_b,d_c,d_d,0xf61e,0x2562,m1,5
+ round_16_31 d_d,d_a,d_b,d_c,0xc040,0xb340,m6,9
+ round_16_31 d_c,d_d,d_a,d_b,0x265e,0x5a51,m11,14
+ round_16_31 d_b,d_c,d_d,d_a,0xe9b6,0xc7aa,m0,20
+ round_16_31 d_a,d_b,d_c,d_d,0xd62f,0x105d,m5,5
+ round_16_31 d_d,d_a,d_b,d_c,0x244,0x1453,m10,9
+ round_16_31 d_c,d_d,d_a,d_b,0xd8a1,0xe681,m15,14
+ round_16_31 d_b,d_c,d_d,d_a,0xe7d3,0xfbc8,m4,20
+ round_16_31 d_a,d_b,d_c,d_d,0x21e1,0xcde6,m9,5
+ round_16_31 d_d,d_a,d_b,d_c,0xc337,0x7d6,m14,9
+ round_16_31 d_c,d_d,d_a,d_b,0xf4d5,0xd87,m3,14
+ round_16_31 d_b,d_c,d_d,d_a,0x455a,0x14ed,m8,20
+ round_16_31 d_a,d_b,d_c,d_d,0xa9e3,0xe905,m13,5
+ round_16_31 d_d,d_a,d_b,d_c,0xfcef,0xa3f8,m2,9
+ round_16_31 d_c,d_d,d_a,d_b,0x676f,0x2d9,m7,14
+ round_16_31 d_b,d_c,d_d,d_a,0x8d2a,0x4c8a,m12,20
+
+ round_32_47 d_a,d_b,d_c,d_d,0xfffa,0x3942,m5,4
+ round_32_47 d_d,d_a,d_b,d_c,0x8771,0xf681,m8,11
+ round_32_47 d_c,d_d,d_a,d_b,0x6d9d,0x6122,m11,16
+ round_32_47 d_b,d_c,d_d,d_a,0xfde5,0x380c,m14,23
+ round_32_47 d_a,d_b,d_c,d_d,0xa4be,0xea44,m1,4
+ round_32_47 d_d,d_a,d_b,d_c,0x4bde,0xcfa9,m4,11
+ round_32_47 d_c,d_d,d_a,d_b,0xf6bb,0x4b60,m7,16
+ round_32_47 d_b,d_c,d_d,d_a,0xbebf,0xbc70,m10,23
+ round_32_47 d_a,d_b,d_c,d_d,0x289b,0x7ec6,m13,4
+ round_32_47 d_d,d_a,d_b,d_c,0xeaa1,0x27fa,m0,11
+ round_32_47 d_c,d_d,d_a,d_b,0xd4ef,0x3085,m3,16
+ round_32_47 d_b,d_c,d_d,d_a,0x488,0x1d05,m6,23
+ round_32_47 d_a,d_b,d_c,d_d,0xd9d4,0xd039,m9,4
+ round_32_47 d_d,d_a,d_b,d_c,0xe6db,0x99e5,m12,11
+ round_32_47 d_c,d_d,d_a,d_b,0x1fa2,0x7cf8,m15,16
+ round_32_47 d_b,d_c,d_d,d_a,0xc4ac,0x5665,m2,23
+
+ round_48_63 d_a,d_b,d_c,d_d,0xf429,0x2244,m0,6
+ round_48_63 d_d,d_a,d_b,d_c,0x432a,0xff97,m7,10
+ round_48_63 d_c,d_d,d_a,d_b,0xab94,0x23a7,m14,15
+ round_48_63 d_b,d_c,d_d,d_a,0xfc93,0xa039,m5,21
+ round_48_63 d_a,d_b,d_c,d_d,0x655b,0x59c3,m12,6
+ round_48_63 d_d,d_a,d_b,d_c,0x8f0c,0xcc92,m3,10
+ round_48_63 d_c,d_d,d_a,d_b,0xffef,0xf47d,m10,15
+ round_48_63 d_b,d_c,d_d,d_a,0x8584,0x5dd1,m1,21
+ round_48_63 d_a,d_b,d_c,d_d,0x6fa8,0x7e4f,m8,6
+ round_48_63 d_d,d_a,d_b,d_c,0xfe2c,0xe6e0,m15,10
+ round_48_63 d_c,d_d,d_a,d_b,0xa301,0x4314,m6,15
+ round_48_63 d_b,d_c,d_d,d_a,0x4e08,0x11a1,m13,21
+ round_48_63 d_a,d_b,d_c,d_d,0xf753,0x7e82,m4,6
+ ldp d_a1,d_b1,[digest_addr]
+ round_48_63 d_d,d_a,d_b,d_c,0xbd3a,0xf235,m11,10
+ ldp d_c1,d_d1,[digest_addr,8]
+ round_48_63 d_c,d_d,d_a,d_b,0x2ad7,0xd2bb,m2,15
+ round_48_63 d_b,d_c,d_d,d_a,0xeb86,0xd391,m9,21
+
+ cmp buf_adr,end
+ add d_a,d_a1 ,d_a
+ str d_a,[digest_addr]
+ add d_b,d_b1 ,d_b
+ str d_b,[digest_addr,4]
+ add d_c,d_c1 ,d_c
+ str d_c,[digest_addr,8]
+ add d_d,d_d1 ,d_d
+ str d_d,[digest_addr,12]
+ bne .loop_start
+
+.exit:
+ ldp x19, x20, [sp, 16]
+ ldp x21, x22, [sp, 32]
+ ldp x23, x24, [sp, 48]
+ ldp x25, x26, [sp, 64]
+ ldp x27, x28, [sp, 80]
+ ldp x29, x30, [sp], 96
+ ret
+ .size md5_mb_asimd_x1, .-md5_mb_asimd_x1
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x4.S b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x4.S
new file mode 100644
index 000000000..53979131d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x4.S
@@ -0,0 +1,526 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ q_\name .req q\reg
+ v_\name .req v\reg
+ s_\name .req s\reg
+.endm
+
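+/* Common tail of every round across the four lanes:
+   a = b + rotl32(a + f(b,c,d) + k + w, r), with f(b,c,d) expected in v_tmp1. */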
+.macro add_key_rol a:req,b:req,k:req,w:req,r:req
+ add v_tmp0.4s,v_\k\().4s,v_\w\().4s
+ add v_tmp1.4s,v_tmp1.4s,v_\a\().4s
+ add v_tmp1.4s,v_tmp1.4s,v_tmp0.4s
+ shl v_tmp0.4s,v_tmp1.4s,\r
+ ushr v_tmp1.4s,v_tmp1.4s,32-\r
+ orr v_tmp0.16b,v_tmp1.16b,v_tmp0.16b
+
+ add v_\a\().4s,v_\b\().4s,v_tmp0.4s
+.endm
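+/* Rounds 0-15: F(b,c,d) = (b AND c) OR ((NOT b) AND d), via BSL with b as the mask. */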
+.macro round_0_15 a:req,b:req,c:req,d:req,k:req,k1:req,w:req,r:req
+ mov v_tmp1.16b, v_\b\().16b
+ bsl v_tmp1.16b, v_\c\().16b, v_\d\().16b
+ ldr q_\k1,[key_adr],16
+ add_key_rol \a,\b,\k,\w,\r
+.endm
+
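+/* Rounds 16-31: G(b,c,d) = (b AND d) OR (c AND (NOT d)), via BSL with d as the mask. */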
+.macro round_16_31 a:req,b:req,c:req,d:req,k:req,k1:req,w:req,r:req
+ mov v_tmp1.16b, v_\d\().16b
+ bsl v_tmp1.16b, v_\b\().16b, v_\c\().16b
+ ldr q_\k1,[key_adr],16
+ add_key_rol \a,\b,\k,\w,\r
+.endm
+
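+/* Rounds 32-47: H(b,c,d) = b XOR c XOR d. */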
+.macro round_32_47 a:req,b:req,c:req,d:req,k:req,k1:req,w:req,r:req
+ eor v_tmp1.16b,v_\b\().16b,v_\c\().16b
+ eor v_tmp1.16b,v_tmp1.16b,v_\d\().16b
+ ldr q_\k1,[key_adr],16
+ add_key_rol \a,\b,\k,\w,\r
+.endm
+
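+/* Rounds 48-63: I(b,c,d) = c XOR (b OR (NOT d)); the trailing key load is
+   skipped when the k1 argument is left empty (final round). */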
+.macro round_48_63 a:req,b:req,c:req,d:req,k:req,k1,w:req,r:req
+ orn v_tmp1.16b,v_\b\().16b,v_\d\().16b
+ eor v_tmp1.16b,v_tmp1.16b,v_\c\().16b
+ .ifnb \k1
+ ldr q_\k1,[key_adr],16
+ .endif
+ add_key_rol \a,\b,\k,\w,\r
+.endm
+/*
+ variables
+*/
+ declare_var_vector_reg tmp0, 0
+ declare_var_vector_reg tmp1, 1
+ declare_var_vector_reg k, 2
+ declare_var_vector_reg k1, 3
+ declare_var_vector_reg a, 4
+ declare_var_vector_reg b, 5
+ declare_var_vector_reg c, 6
+ declare_var_vector_reg d, 7
+ declare_var_vector_reg a1, 8
+ declare_var_vector_reg b1, 9
+ declare_var_vector_reg c1, 10
+ declare_var_vector_reg d1, 11
+
+ declare_var_vector_reg w0, 16
+ declare_var_vector_reg w1, 17
+ declare_var_vector_reg w2, 18
+ declare_var_vector_reg w3, 19
+ declare_var_vector_reg w4, 20
+ declare_var_vector_reg w5, 21
+ declare_var_vector_reg w6, 22
+ declare_var_vector_reg w7, 23
+ declare_var_vector_reg w8, 24
+ declare_var_vector_reg w9, 25
+ declare_var_vector_reg w10, 26
+ declare_var_vector_reg w11, 27
+ declare_var_vector_reg w12, 28
+ declare_var_vector_reg w13, 29
+ declare_var_vector_reg w14, 30
+ declare_var_vector_reg w15, 31
+
+ len .req w4
+ len_x .req x4
+ lane0 .req x5
+ lane1 .req x6
+ lane2 .req x7
+ lane3 .req x9
+ end .req x4
+ job0 .req x0
+ job1 .req x1
+ job2 .req x2
+ job3 .req x3
+ key_adr .req x10
+
+/*
+ void md5_mb_asimd_x4(MD5_JOB * job0, MD5_JOB * job1,
+ MD5_JOB * job2, MD5_JOB * job3, int len)
+*/
+ .global md5_mb_asimd_x4
+ .type md5_mb_asimd_x4, %function
+md5_mb_asimd_x4:
+ stp x29,x30,[sp,-48]!
+ ldr lane0,[job0],64
+ stp d8,d9,[sp,16]
+ ldr lane1,[job1],64
+ stp d10,d11,[sp,32]
+ ldr lane2,[job2],64
+ cmp len,0
+ ldr lane3,[job3],64
+ ble .exit
+
+ //load digests
+ ld4 {v_a.s-v_d.s}[0],[job0]
+ add end,lane0,len_x,lsl 6
+ ld4 {v_a.s-v_d.s}[1],[job1]
+ ld4 {v_a.s-v_d.s}[2],[job2]
+ ld4 {v_a.s-v_d.s}[3],[job3]
+.loop_start:
+ ld1 {v_w0.s}[0],[lane0],4
+ mov v_a1.16b,v_a.16b
+ ld1 {v_w0.s}[1],[lane1],4
+ mov v_b1.16b,v_b.16b
+ ld1 {v_w0.s}[2],[lane2],4
+ mov v_c1.16b,v_c.16b
+ ld1 {v_w0.s}[3],[lane3],4
+ mov v_d1.16b,v_d.16b
+
+ ld3 {v_w1.s-v_w3.s}[0],[lane0],12
+ adrp key_adr,.key_consts
+ ld3 {v_w1.s-v_w3.s}[1],[lane1],12
+ add key_adr,key_adr,#:lo12:.key_consts
+ ld3 {v_w1.s-v_w3.s}[2],[lane2],12
+ ldr q_k,[key_adr],16
+ ld3 {v_w1.s-v_w3.s}[3],[lane3],12
+
+
+ ld4 {v_w4.s-v_w7.s}[0], [lane0],16
+
+ round_0_15 a,b,c,d,k,k1,w0,7
+
+ ld4 {v_w4.s-v_w7.s}[1], [lane1],16
+ round_0_15 d,a,b,c,k1,k,w1,12
+ ld4 {v_w4.s-v_w7.s}[2], [lane2],16
+ round_0_15 c,d,a,b,k,k1,w2,17
+ ld4 {v_w4.s-v_w7.s}[3], [lane3],16
+ round_0_15 b,c,d,a,k1,k,w3,22
+ ld4 {v_w8.s-v_w11.s}[0],[lane0],16
+ round_0_15 a,b,c,d,k,k1,w4,7
+ ld4 {v_w8.s-v_w11.s}[1],[lane1],16
+ round_0_15 d,a,b,c,k1,k,w5,12
+ ld4 {v_w8.s-v_w11.s}[2],[lane2],16
+ round_0_15 c,d,a,b,k,k1,w6,17
+ ld4 {v_w8.s-v_w11.s}[3],[lane3],16
+ round_0_15 b,c,d,a,k1,k,w7,22
+ ld4 {v_w12.s-v_w15.s}[0],[lane0],16
+ round_0_15 a,b,c,d,k,k1,w8,7
+ ld4 {v_w12.s-v_w15.s}[1],[lane1],16
+ round_0_15 d,a,b,c,k1,k,w9,12
+ ld4 {v_w12.s-v_w15.s}[2],[lane2],16
+ round_0_15 c,d,a,b,k,k1,w10,17
+ ld4 {v_w12.s-v_w15.s}[3],[lane3],16
+ round_0_15 b,c,d,a,k1,k,w11,22
+ round_0_15 a,b,c,d,k,k1,w12,7
+ round_0_15 d,a,b,c,k1,k,w13,12
+ round_0_15 c,d,a,b,k,k1,w14,17
+ round_0_15 b,c,d,a,k1,k,w15,22
+
+ round_16_31 a,b,c,d,k,k1,w1,5
+ round_16_31 d,a,b,c,k1,k,w6,9
+ round_16_31 c,d,a,b,k,k1,w11,14
+ round_16_31 b,c,d,a,k1,k,w0,20
+ round_16_31 a,b,c,d,k,k1,w5,5
+ round_16_31 d,a,b,c,k1,k,w10,9
+ round_16_31 c,d,a,b,k,k1,w15,14
+ round_16_31 b,c,d,a,k1,k,w4,20
+ round_16_31 a,b,c,d,k,k1,w9,5
+ round_16_31 d,a,b,c,k1,k,w14,9
+ round_16_31 c,d,a,b,k,k1,w3,14
+ round_16_31 b,c,d,a,k1,k,w8,20
+ round_16_31 a,b,c,d,k,k1,w13,5
+ round_16_31 d,a,b,c,k1,k,w2,9
+ round_16_31 c,d,a,b,k,k1,w7,14
+ round_16_31 b,c,d,a,k1,k,w12,20
+
+ round_32_47 a,b,c,d,k,k1,w5,4
+ round_32_47 d,a,b,c,k1,k,w8,11
+ round_32_47 c,d,a,b,k,k1,w11,16
+ round_32_47 b,c,d,a,k1,k,w14,23
+ round_32_47 a,b,c,d,k,k1,w1,4
+ round_32_47 d,a,b,c,k1,k,w4,11
+ round_32_47 c,d,a,b,k,k1,w7,16
+ round_32_47 b,c,d,a,k1,k,w10,23
+ round_32_47 a,b,c,d,k,k1,w13,4
+ round_32_47 d,a,b,c,k1,k,w0,11
+ round_32_47 c,d,a,b,k,k1,w3,16
+ round_32_47 b,c,d,a,k1,k,w6,23
+ round_32_47 a,b,c,d,k,k1,w9,4
+ round_32_47 d,a,b,c,k1,k,w12,11
+ round_32_47 c,d,a,b,k,k1,w15,16
+ round_32_47 b,c,d,a,k1,k,w2,23
+
+ round_48_63 a,b,c,d,k,k1,w0,6
+ round_48_63 d,a,b,c,k1,k,w7,10
+ round_48_63 c,d,a,b,k,k1,w14,15
+ round_48_63 b,c,d,a,k1,k,w5,21
+ round_48_63 a,b,c,d,k,k1,w12,6
+ round_48_63 d,a,b,c,k1,k,w3,10
+ round_48_63 c,d,a,b,k,k1,w10,15
+ round_48_63 b,c,d,a,k1,k,w1,21
+ round_48_63 a,b,c,d,k,k1,w8,6
+ round_48_63 d,a,b,c,k1,k,w15,10
+ round_48_63 c,d,a,b,k,k1,w6,15
+ round_48_63 b,c,d,a,k1,k,w13,21
+ round_48_63 a,b,c,d,k,k1,w4,6
+ round_48_63 d,a,b,c,k1,k,w11,10
+ round_48_63 c,d,a,b,k,k1,w2,15
+ round_48_63 b,c,d,a,k1, ,w9,21
+
+
+
+
+ cmp lane0,end
+ add v_a.4s,v_a1.4s,v_a.4s
+ add v_b.4s,v_b1.4s,v_b.4s
+ add v_c.4s,v_c1.4s,v_c.4s
+ add v_d.4s,v_d1.4s,v_d.4s
+ bne .loop_start
+
+ st4 {v_a.s-v_d.s}[0],[job0]
+ st4 {v_a.s-v_d.s}[1],[job1]
+ st4 {v_a.s-v_d.s}[2],[job2]
+ st4 {v_a.s-v_d.s}[3],[job3]
+.exit:
+ ldp d8,d9,[sp,16]
+ ldp d10,d11,[sp,32]
+ ldp x29,x30,[sp],48
+ ret
+.key_consts:
+ .word 0xd76aa478
+ .word 0xd76aa478
+ .word 0xd76aa478
+ .word 0xd76aa478
+ .word 0xe8c7b756
+ .word 0xe8c7b756
+ .word 0xe8c7b756
+ .word 0xe8c7b756
+ .word 0x242070db
+ .word 0x242070db
+ .word 0x242070db
+ .word 0x242070db
+ .word 0xc1bdceee
+ .word 0xc1bdceee
+ .word 0xc1bdceee
+ .word 0xc1bdceee
+ .word 0xf57c0faf
+ .word 0xf57c0faf
+ .word 0xf57c0faf
+ .word 0xf57c0faf
+ .word 0x4787c62a
+ .word 0x4787c62a
+ .word 0x4787c62a
+ .word 0x4787c62a
+ .word 0xa8304613
+ .word 0xa8304613
+ .word 0xa8304613
+ .word 0xa8304613
+ .word 0xfd469501
+ .word 0xfd469501
+ .word 0xfd469501
+ .word 0xfd469501
+ .word 0x698098d8
+ .word 0x698098d8
+ .word 0x698098d8
+ .word 0x698098d8
+ .word 0x8b44f7af
+ .word 0x8b44f7af
+ .word 0x8b44f7af
+ .word 0x8b44f7af
+ .word 0xffff5bb1
+ .word 0xffff5bb1
+ .word 0xffff5bb1
+ .word 0xffff5bb1
+ .word 0x895cd7be
+ .word 0x895cd7be
+ .word 0x895cd7be
+ .word 0x895cd7be
+ .word 0x6b901122
+ .word 0x6b901122
+ .word 0x6b901122
+ .word 0x6b901122
+ .word 0xfd987193
+ .word 0xfd987193
+ .word 0xfd987193
+ .word 0xfd987193
+ .word 0xa679438e
+ .word 0xa679438e
+ .word 0xa679438e
+ .word 0xa679438e
+ .word 0x49b40821
+ .word 0x49b40821
+ .word 0x49b40821
+ .word 0x49b40821
+ .word 0xf61e2562
+ .word 0xf61e2562
+ .word 0xf61e2562
+ .word 0xf61e2562
+ .word 0xc040b340
+ .word 0xc040b340
+ .word 0xc040b340
+ .word 0xc040b340
+ .word 0x265e5a51
+ .word 0x265e5a51
+ .word 0x265e5a51
+ .word 0x265e5a51
+ .word 0xe9b6c7aa
+ .word 0xe9b6c7aa
+ .word 0xe9b6c7aa
+ .word 0xe9b6c7aa
+ .word 0xd62f105d
+ .word 0xd62f105d
+ .word 0xd62f105d
+ .word 0xd62f105d
+ .word 0x02441453
+ .word 0x02441453
+ .word 0x02441453
+ .word 0x02441453
+ .word 0xd8a1e681
+ .word 0xd8a1e681
+ .word 0xd8a1e681
+ .word 0xd8a1e681
+ .word 0xe7d3fbc8
+ .word 0xe7d3fbc8
+ .word 0xe7d3fbc8
+ .word 0xe7d3fbc8
+ .word 0x21e1cde6
+ .word 0x21e1cde6
+ .word 0x21e1cde6
+ .word 0x21e1cde6
+ .word 0xc33707d6
+ .word 0xc33707d6
+ .word 0xc33707d6
+ .word 0xc33707d6
+ .word 0xf4d50d87
+ .word 0xf4d50d87
+ .word 0xf4d50d87
+ .word 0xf4d50d87
+ .word 0x455a14ed
+ .word 0x455a14ed
+ .word 0x455a14ed
+ .word 0x455a14ed
+ .word 0xa9e3e905
+ .word 0xa9e3e905
+ .word 0xa9e3e905
+ .word 0xa9e3e905
+ .word 0xfcefa3f8
+ .word 0xfcefa3f8
+ .word 0xfcefa3f8
+ .word 0xfcefa3f8
+ .word 0x676f02d9
+ .word 0x676f02d9
+ .word 0x676f02d9
+ .word 0x676f02d9
+ .word 0x8d2a4c8a
+ .word 0x8d2a4c8a
+ .word 0x8d2a4c8a
+ .word 0x8d2a4c8a
+ .word 0xfffa3942
+ .word 0xfffa3942
+ .word 0xfffa3942
+ .word 0xfffa3942
+ .word 0x8771f681
+ .word 0x8771f681
+ .word 0x8771f681
+ .word 0x8771f681
+ .word 0x6d9d6122
+ .word 0x6d9d6122
+ .word 0x6d9d6122
+ .word 0x6d9d6122
+ .word 0xfde5380c
+ .word 0xfde5380c
+ .word 0xfde5380c
+ .word 0xfde5380c
+ .word 0xa4beea44
+ .word 0xa4beea44
+ .word 0xa4beea44
+ .word 0xa4beea44
+ .word 0x4bdecfa9
+ .word 0x4bdecfa9
+ .word 0x4bdecfa9
+ .word 0x4bdecfa9
+ .word 0xf6bb4b60
+ .word 0xf6bb4b60
+ .word 0xf6bb4b60
+ .word 0xf6bb4b60
+ .word 0xbebfbc70
+ .word 0xbebfbc70
+ .word 0xbebfbc70
+ .word 0xbebfbc70
+ .word 0x289b7ec6
+ .word 0x289b7ec6
+ .word 0x289b7ec6
+ .word 0x289b7ec6
+ .word 0xeaa127fa
+ .word 0xeaa127fa
+ .word 0xeaa127fa
+ .word 0xeaa127fa
+ .word 0xd4ef3085
+ .word 0xd4ef3085
+ .word 0xd4ef3085
+ .word 0xd4ef3085
+ .word 0x04881d05
+ .word 0x04881d05
+ .word 0x04881d05
+ .word 0x04881d05
+ .word 0xd9d4d039
+ .word 0xd9d4d039
+ .word 0xd9d4d039
+ .word 0xd9d4d039
+ .word 0xe6db99e5
+ .word 0xe6db99e5
+ .word 0xe6db99e5
+ .word 0xe6db99e5
+ .word 0x1fa27cf8
+ .word 0x1fa27cf8
+ .word 0x1fa27cf8
+ .word 0x1fa27cf8
+ .word 0xc4ac5665
+ .word 0xc4ac5665
+ .word 0xc4ac5665
+ .word 0xc4ac5665
+ .word 0xf4292244
+ .word 0xf4292244
+ .word 0xf4292244
+ .word 0xf4292244
+ .word 0x432aff97
+ .word 0x432aff97
+ .word 0x432aff97
+ .word 0x432aff97
+ .word 0xab9423a7
+ .word 0xab9423a7
+ .word 0xab9423a7
+ .word 0xab9423a7
+ .word 0xfc93a039
+ .word 0xfc93a039
+ .word 0xfc93a039
+ .word 0xfc93a039
+ .word 0x655b59c3
+ .word 0x655b59c3
+ .word 0x655b59c3
+ .word 0x655b59c3
+ .word 0x8f0ccc92
+ .word 0x8f0ccc92
+ .word 0x8f0ccc92
+ .word 0x8f0ccc92
+ .word 0xffeff47d
+ .word 0xffeff47d
+ .word 0xffeff47d
+ .word 0xffeff47d
+ .word 0x85845dd1
+ .word 0x85845dd1
+ .word 0x85845dd1
+ .word 0x85845dd1
+ .word 0x6fa87e4f
+ .word 0x6fa87e4f
+ .word 0x6fa87e4f
+ .word 0x6fa87e4f
+ .word 0xfe2ce6e0
+ .word 0xfe2ce6e0
+ .word 0xfe2ce6e0
+ .word 0xfe2ce6e0
+ .word 0xa3014314
+ .word 0xa3014314
+ .word 0xa3014314
+ .word 0xa3014314
+ .word 0x4e0811a1
+ .word 0x4e0811a1
+ .word 0x4e0811a1
+ .word 0x4e0811a1
+ .word 0xf7537e82
+ .word 0xf7537e82
+ .word 0xf7537e82
+ .word 0xf7537e82
+ .word 0xbd3af235
+ .word 0xbd3af235
+ .word 0xbd3af235
+ .word 0xbd3af235
+ .word 0x2ad7d2bb
+ .word 0x2ad7d2bb
+ .word 0x2ad7d2bb
+ .word 0x2ad7d2bb
+ .word 0xeb86d391
+ .word 0xeb86d391
+ .word 0xeb86d391
+ .word 0xeb86d391
+ .size md5_mb_asimd_x4, .-md5_mb_asimd_x4
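
The .key_consts table above holds the 64 standard MD5 round constants, T[i] = floor(2^32 * |sin(i + 1)|), with each value repeated four times so the x4 kernel can load one constant per round as a full vector and apply it to all four lanes at once. A minimal sketch, not part of the patch, that regenerates the replicated table for verification (compile with -lm):

/* regenerate the x4 MD5 constant table; illustrative only */
#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	for (int i = 0; i < 64; i++) {
		uint32_t t = (uint32_t) floor(fabs(sin(i + 1)) * 4294967296.0);
		for (int lane = 0; lane < 4; lane++)	/* one copy per SIMD lane */
			printf("	.word 0x%08x\n", t);
	}
	return 0;
}
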
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_mgr_aarch64_asimd.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_mgr_aarch64_asimd.c
new file mode 100644
index 000000000..5289cd91f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_mgr_aarch64_asimd.c
@@ -0,0 +1,187 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stddef.h>
+#include <md5_mb.h>
+#include <assert.h>
+
+#ifndef max
+#define max(a,b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#define MD5_MB_CE_MAX_LANES 4
+void md5_mb_asimd_x4(MD5_JOB *, MD5_JOB *, MD5_JOB *, MD5_JOB *, int);
+void md5_mb_asimd_x1(MD5_JOB *, int);
+
+#define LANE_IS_NOT_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FREE(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL)
+#define LANE_IS_INVALID(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL)
+void md5_mb_mgr_init_asimd(MD5_MB_JOB_MGR * state)
+{
+ unsigned int i;
+
+ state->unused_lanes[0] = 0xf;
+ state->num_lanes_inuse = 0;
+ for (i = 0; i < MD5_MB_CE_MAX_LANES; i++) {
+ state->unused_lanes[0] <<= 4;
+ state->unused_lanes[0] |= MD5_MB_CE_MAX_LANES - 1 - i;
+ state->lens[i] = i;
+ state->ldata[i].job_in_lane = 0;
+ }
+
+	// lanes with index >= MD5_MB_CE_MAX_LANES are invalid
+ for (; i < MD5_MAX_LANES; i++) {
+ state->lens[i] = 0xf;
+ state->ldata[i].job_in_lane = 0;
+ }
+}
+
+static int md5_mb_mgr_do_jobs(MD5_MB_JOB_MGR * state)
+{
+ int lane_idx, len, i;
+
+ if (state->num_lanes_inuse == 0) {
+ return -1;
+ }
+ if (state->num_lanes_inuse == 4) {
+ len = min(min(state->lens[0], state->lens[1]),
+ min(state->lens[2], state->lens[3]));
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+ md5_mb_asimd_x4(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane,
+ state->ldata[2].job_in_lane,
+ state->ldata[3].job_in_lane, len >> 4);
+		// only the lane holding the shortest job completes in this pass; return its index
+ for (i = 0; i < MD5_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 2;
+ }
+ }
+
+ return lane_idx;
+ } else {
+ for (i = 0; i < MD5_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ len = state->lens[i] & (~0xf);
+ md5_mb_asimd_x1(state->ldata[i].job_in_lane, len >> 4);
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 2;
+ return i;
+ }
+ }
+ }
+ return -1;
+
+}
+
+static MD5_JOB *md5_mb_mgr_free_lane(MD5_MB_JOB_MGR * state)
+{
+ int i;
+ MD5_JOB *ret = NULL;
+
+ for (i = 0; i < MD5_MB_CE_MAX_LANES; i++) {
+ if (LANE_IS_FINISHED(state, i)) {
+
+ state->unused_lanes[0] <<= 4;
+ state->unused_lanes[0] |= i;
+ state->num_lanes_inuse--;
+ ret = state->ldata[i].job_in_lane;
+ ret->status = STS_COMPLETED;
+ state->ldata[i].job_in_lane = NULL;
+ break;
+ }
+ }
+ return ret;
+}
+
+static void md5_mb_mgr_insert_job(MD5_MB_JOB_MGR * state, MD5_JOB * job)
+{
+ int lane_idx;
+	// pick the next free lane from the unused-lane queue
+	lane_idx = state->unused_lanes[0] & 0xf;
+	// running out of lanes here indicates a fatal logic error
+ assert(lane_idx < MD5_MB_CE_MAX_LANES);
+ state->lens[lane_idx] = (job->len << 4) | lane_idx;
+ state->ldata[lane_idx].job_in_lane = job;
+ state->unused_lanes[0] >>= 4;
+ state->num_lanes_inuse++;
+}
+
+MD5_JOB *md5_mb_mgr_submit_asimd(MD5_MB_JOB_MGR * state, MD5_JOB * job)
+{
+#ifndef NDEBUG
+ int lane_idx;
+#endif
+ MD5_JOB *ret;
+
+ //add job into lanes
+ md5_mb_mgr_insert_job(state, job);
+
+ ret = md5_mb_mgr_free_lane(state);
+ if (ret != NULL) {
+ return ret;
+ }
+	// submit only starts hashing once every lane has a job queued
+ if (state->num_lanes_inuse < MD5_MB_CE_MAX_LANES)
+ return NULL;
+#ifndef NDEBUG
+ lane_idx = md5_mb_mgr_do_jobs(state);
+ assert(lane_idx != -1);
+#else
+ md5_mb_mgr_do_jobs(state);
+#endif
+
+ ret = md5_mb_mgr_free_lane(state);
+ return ret;
+}
+
+MD5_JOB *md5_mb_mgr_flush_asimd(MD5_MB_JOB_MGR * state)
+{
+ MD5_JOB *ret;
+ ret = md5_mb_mgr_free_lane(state);
+ if (ret) {
+ return ret;
+ }
+
+ md5_mb_mgr_do_jobs(state);
+ return md5_mb_mgr_free_lane(state);
+
+}
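
The manager above packs each lane's state into a single word: state->lens[i] keeps the remaining length in 64-byte blocks in its upper bits and the lane index in the low 4 bits, so one min() over the four entries yields both the shortest job and the lane it occupies. A small self-contained sketch of that packing, with illustrative names, not part of the patch:

#include <assert.h>
#include <stdint.h>

/* low 4 bits: lane index; remaining bits: block count */
static inline uint32_t pack_len(uint32_t blocks, uint32_t lane)
{
	return (blocks << 4) | lane;
}

int main(void)
{
	uint32_t lens[4] = {
		pack_len(10, 0), pack_len(3, 1), pack_len(7, 2), pack_len(12, 3)
	};
	uint32_t m = lens[0];
	for (int i = 1; i < 4; i++)
		if (lens[i] < m)
			m = lens[i];
	assert((m & 0xf) == 1);			/* lane 1 holds the shortest job */
	assert(((m & ~0xf) >> 4) == 3);		/* which is 3 blocks long */
	return 0;
}
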
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_multibinary.S b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_multibinary.S
new file mode 100644
index 000000000..b66320f5c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_multibinary.S
@@ -0,0 +1,36 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#include "aarch64_multibinary.h"
+
+
+mbin_interface md5_ctx_mgr_submit
+mbin_interface md5_ctx_mgr_init
+mbin_interface md5_ctx_mgr_flush
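
In the usual ISA-L multibinary scheme, each mbin_interface line declares a public symbol whose first call runs a dispatcher that picks the best implementation for the running CPU and caches a function pointer, so later calls are a single indirect jump. A minimal self-contained C sketch of that lazy-dispatch pattern, with illustrative names only, not the macro's actual output:

#include <stdio.h>

static int impl_base(void)  { return puts("base impl"); }
static int impl_asimd(void) { return puts("asimd impl"); }
static int cpu_has_asimd(void) { return 1; }	/* stand-in for a HWCAP/CPUID probe */

static int dispatch(void);
static int (*entry)(void) = dispatch;		/* the public symbol jumps through this */

static int dispatch(void)
{
	entry = cpu_has_asimd()? impl_asimd : impl_base;	/* resolve once */
	return entry();
}

int main(void)
{
	entry();	/* first call resolves the implementation */
	entry();	/* later calls go straight to it */
	return 0;
}
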
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c
new file mode 100644
index 000000000..ac03a6705
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c
@@ -0,0 +1,263 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx")
+#endif
+
+#include "md5_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+static inline void hash_init_digest(MD5_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len);
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx);
+
+void md5_ctx_mgr_init_avx(MD5_HASH_CTX_MGR * mgr)
+{
+ md5_mb_mgr_init_avx(&mgr->mgr);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_submit_avx(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return md5_ctx_mgr_resubmit(mgr, ctx);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_flush_avx(MD5_HASH_CTX_MGR * mgr)
+{
+ MD5_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_avx(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = md5_ctx_mgr_resubmit(mgr, ctx);
+
+ // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len);
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % MD5_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= MD5_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(MD5_WORD_T * digest)
+{
+ static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] =
+ { MD5_INITIAL_DIGEST };
+ //memcpy(digest, hash_initial_digest, sizeof(hash_initial_digest));
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (MD5_BLOCK_SIZE - 1));
+
+ // memset(&padblock[i], 0, MD5_BLOCK_SIZE);
+ memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ MD5_PADLENGTHFIELD_SIZE;
+
+ *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3);
+
+ return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver md5_ctx_mgr_init_avx_slver_02020183;
+struct slver md5_ctx_mgr_init_avx_slver = { 0x0183, 0x02, 0x02 };
+
+struct slver md5_ctx_mgr_submit_avx_slver_02020184;
+struct slver md5_ctx_mgr_submit_avx_slver = { 0x0184, 0x02, 0x02 };
+
+struct slver md5_ctx_mgr_flush_avx_slver_02020185;
+struct slver md5_ctx_mgr_flush_avx_slver = { 0x0185, 0x02, 0x02 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
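
For reference, the context manager above follows the usual ISA-L multi-buffer usage: each message gets its own MD5_HASH_CTX, submit feeds contexts in, and flush drains whatever is still in flight. A minimal usage sketch, assuming the standard hash_ctx_init() helper from multi_buffer.h and glossing over the allocation and alignment details a real caller handles:

#include <string.h>
#include "md5_mb.h"

int main(void)
{
	static MD5_HASH_CTX_MGR mgr;		/* static storage keeps the sketch simple */
	static MD5_HASH_CTX ctxpool[4];
	const char *msg = "hello";

	md5_ctx_mgr_init_avx(&mgr);
	for (int i = 0; i < 4; i++) {
		hash_ctx_init(&ctxpool[i]);	/* assumed helper from multi_buffer.h */
		md5_ctx_mgr_submit_avx(&mgr, &ctxpool[i], msg,
				       (uint32_t) strlen(msg), HASH_ENTIRE);
	}
	while (md5_ctx_mgr_flush_avx(&mgr) != NULL)
		;	/* each returned context carries its digest in ctx->job.result_digest */
	return 0;
}
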
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c
new file mode 100644
index 000000000..cdc910c0d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c
@@ -0,0 +1,263 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "md5_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+static inline void hash_init_digest(MD5_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len);
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx);
+
+void md5_ctx_mgr_init_avx2(MD5_HASH_CTX_MGR * mgr)
+{
+ md5_mb_mgr_init_avx2(&mgr->mgr);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_submit_avx2(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return md5_ctx_mgr_resubmit(mgr, ctx);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_flush_avx2(MD5_HASH_CTX_MGR * mgr)
+{
+ MD5_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_avx2(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = md5_ctx_mgr_resubmit(mgr, ctx);
+
+ // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len);
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % MD5_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= MD5_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(MD5_WORD_T * digest)
+{
+ static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] =
+ { MD5_INITIAL_DIGEST };
+ //memcpy(digest, hash_initial_digest, sizeof(hash_initial_digest));
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (MD5_BLOCK_SIZE - 1));
+
+ // memset(&padblock[i], 0, MD5_BLOCK_SIZE);
+ memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ MD5_PADLENGTHFIELD_SIZE;
+
+ *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3);
+
+ return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver md5_ctx_mgr_init_avx2_slver_04020186;
+struct slver md5_ctx_mgr_init_avx2_slver = { 0x0186, 0x02, 0x04 };
+
+struct slver md5_ctx_mgr_submit_avx2_slver_04020187;
+struct slver md5_ctx_mgr_submit_avx2_slver = { 0x0187, 0x02, 0x04 };
+
+struct slver md5_ctx_mgr_flush_avx2_slver_04020188;
+struct slver md5_ctx_mgr_flush_avx2_slver = { 0x0188, 0x02, 0x04 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
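
The hash_pad() helper above writes the 0x80 marker, the zero fill, and the 64-bit bit-length into the extra-block buffer and reports whether that padding occupies one or two 64-byte blocks; two are needed whenever more than 55 residual bytes are already buffered. A small self-contained check of the same arithmetic, with local names, not part of the patch:

#include <assert.h>
#include <stdint.h>

/* mirror of the block-count arithmetic in hash_pad() */
static uint32_t extra_blocks(uint64_t total_len)
{
	uint32_t i = (uint32_t) (total_len & 63);
	i += (63 & (0 - (total_len + 8 + 1))) + 1 + 8;
	return i >> 6;
}

int main(void)
{
	assert(extra_blocks(0) == 1);	/* empty message: one pad block */
	assert(extra_blocks(55) == 1);	/* 55 residual bytes still fit */
	assert(extra_blocks(56) == 2);	/* 56 residual bytes spill into a second block */
	return 0;
}
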
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c
new file mode 100644
index 000000000..682c2ed5e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c
@@ -0,0 +1,267 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "md5_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+static inline void hash_init_digest(MD5_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len);
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx);
+
+void md5_ctx_mgr_init_avx512(MD5_HASH_CTX_MGR * mgr)
+{
+ md5_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_submit_avx512(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return md5_ctx_mgr_resubmit(mgr, ctx);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_flush_avx512(MD5_HASH_CTX_MGR * mgr)
+{
+ MD5_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_avx512(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = md5_ctx_mgr_resubmit(mgr, ctx);
+
+ // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len);
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % MD5_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= MD5_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(MD5_WORD_T * digest)
+{
+ static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] =
+ { MD5_INITIAL_DIGEST };
+ //memcpy(digest, hash_initial_digest, sizeof(hash_initial_digest));
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (MD5_BLOCK_SIZE - 1));
+
+ // memset(&padblock[i], 0, MD5_BLOCK_SIZE);
+ memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ MD5_PADLENGTHFIELD_SIZE;
+
+ *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3);
+
+ return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver md5_ctx_mgr_init_avx512_slver_0600018c;
+struct slver md5_ctx_mgr_init_avx512_slver = { 0x018c, 0x00, 0x06 };
+
+struct slver md5_ctx_mgr_submit_avx512_slver_0600018d;
+struct slver md5_ctx_mgr_submit_avx512_slver = { 0x018d, 0x00, 0x06 };
+
+struct slver md5_ctx_mgr_flush_avx512_slver_0600018e;
+struct slver md5_ctx_mgr_flush_avx512_slver = { 0x018e, 0x00, 0x06 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
+
+#endif // HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base.c
new file mode 100644
index 000000000..c1d2a2738
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base.c
@@ -0,0 +1,291 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "md5_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+#define F1(b,c,d) (d ^ (b & (c ^ d)))
+#define F2(b,c,d) (c ^ (d & (b ^ c)))
+#define F3(b,c,d) (b ^ c ^ d)
+#define F4(b,c,d) (c ^ (b | ~d))
+
+#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r))))
+
+#define step(i,a,b,c,d,f,k,w,r) \
+ if (i < 16) {f = F1(b,c,d); } else \
+ if (i < 32) {f = F2(b,c,d); } else \
+ if (i < 48) {f = F3(b,c,d); } else \
+ {f = F4(b,c,d); } \
+ f = a + f + k + to_le32(w); \
+ a = b + rol32(f, r);
+
+static void md5_init(MD5_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static uint32_t md5_update(MD5_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static void md5_final(MD5_HASH_CTX * ctx, uint32_t remain_len);
+static void OPT_FIX md5_single(const void *data, uint32_t digest[4]);
+static inline void hash_init_digest(MD5_WORD_T * digest);
+
+void md5_ctx_mgr_init_base(MD5_HASH_CTX_MGR * mgr)
+{
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_submit_base(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ uint32_t remain_len;
+
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_PROCESSING) && (flags == HASH_ENTIRE)) {
+ // Cannot submit a new entire job to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags == HASH_FIRST) {
+
+ md5_init(ctx, buffer, len);
+ md5_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_UPDATE) {
+ md5_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_LAST) {
+ remain_len = md5_update(ctx, buffer, len);
+ md5_final(ctx, remain_len);
+ }
+
+ if (flags == HASH_ENTIRE) {
+ md5_init(ctx, buffer, len);
+ remain_len = md5_update(ctx, buffer, len);
+ md5_final(ctx, remain_len);
+ }
+
+ return ctx;
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_flush_base(MD5_HASH_CTX_MGR * mgr)
+{
+ return NULL;
+}
+
+static void md5_init(MD5_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Mark it as processing
+ ctx->status = HASH_CTX_STS_PROCESSING;
+}
+
+static uint32_t md5_update(MD5_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ uint32_t remain_len = len;
+ uint32_t *digest = ctx->job.result_digest;
+ while (remain_len >= 64) {
+ md5_single(buffer, digest);
+ buffer = (void *)((uint8_t *) buffer + 64);
+ remain_len -= 64;
+ ctx->total_length += 64;
+ }
+
+ ctx->status = HASH_CTX_STS_IDLE;
+ ctx->incoming_buffer = buffer;
+ return remain_len;
+}
+
+static void md5_final(MD5_HASH_CTX * ctx, uint32_t remain_len)
+{
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t i = remain_len, j;
+ uint8_t buf[128];
+ uint32_t *digest = ctx->job.result_digest;
+
+ ctx->total_length += i;
+ memcpy(buf, buffer, i);
+ buf[i++] = 0x80;
+ for (j = i; j < 120; j++)
+ buf[j] = 0;
+
+ if (i > 64 - 8)
+ i = 128;
+ else
+ i = 64;
+
+ *(uint64_t *) (buf + i - 8) = to_le64((uint64_t) ctx->total_length * 8);
+
+ md5_single(buf, digest);
+ if (i == 128) {
+ md5_single(buf + 64, digest);
+ }
+
+ ctx->status = HASH_CTX_STS_COMPLETE;
+}
+
+static void md5_single(const void *data, uint32_t digest[4])
+{
+
+ uint32_t a, b, c, d;
+ uint32_t f;
+ uint32_t *w = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+
+ step(0, a, b, c, d, f, 0xd76aa478, w[0], 7);
+ step(1, d, a, b, c, f, 0xe8c7b756, w[1], 12);
+ step(2, c, d, a, b, f, 0x242070db, w[2], 17);
+ step(3, b, c, d, a, f, 0xc1bdceee, w[3], 22);
+ step(4, a, b, c, d, f, 0xf57c0faf, w[4], 7);
+ step(5, d, a, b, c, f, 0x4787c62a, w[5], 12);
+ step(6, c, d, a, b, f, 0xa8304613, w[6], 17);
+ step(7, b, c, d, a, f, 0xfd469501, w[7], 22);
+ step(8, a, b, c, d, f, 0x698098d8, w[8], 7);
+ step(9, d, a, b, c, f, 0x8b44f7af, w[9], 12);
+ step(10, c, d, a, b, f, 0xffff5bb1, w[10], 17);
+ step(11, b, c, d, a, f, 0x895cd7be, w[11], 22);
+ step(12, a, b, c, d, f, 0x6b901122, w[12], 7);
+ step(13, d, a, b, c, f, 0xfd987193, w[13], 12);
+ step(14, c, d, a, b, f, 0xa679438e, w[14], 17);
+ step(15, b, c, d, a, f, 0x49b40821, w[15], 22);
+
+ step(16, a, b, c, d, f, 0xf61e2562, w[1], 5);
+ step(17, d, a, b, c, f, 0xc040b340, w[6], 9);
+ step(18, c, d, a, b, f, 0x265e5a51, w[11], 14);
+ step(19, b, c, d, a, f, 0xe9b6c7aa, w[0], 20);
+ step(20, a, b, c, d, f, 0xd62f105d, w[5], 5);
+ step(21, d, a, b, c, f, 0x02441453, w[10], 9);
+ step(22, c, d, a, b, f, 0xd8a1e681, w[15], 14);
+ step(23, b, c, d, a, f, 0xe7d3fbc8, w[4], 20);
+ step(24, a, b, c, d, f, 0x21e1cde6, w[9], 5);
+ step(25, d, a, b, c, f, 0xc33707d6, w[14], 9);
+ step(26, c, d, a, b, f, 0xf4d50d87, w[3], 14);
+ step(27, b, c, d, a, f, 0x455a14ed, w[8], 20);
+ step(28, a, b, c, d, f, 0xa9e3e905, w[13], 5);
+ step(29, d, a, b, c, f, 0xfcefa3f8, w[2], 9);
+ step(30, c, d, a, b, f, 0x676f02d9, w[7], 14);
+ step(31, b, c, d, a, f, 0x8d2a4c8a, w[12], 20);
+
+ step(32, a, b, c, d, f, 0xfffa3942, w[5], 4);
+ step(33, d, a, b, c, f, 0x8771f681, w[8], 11);
+ step(34, c, d, a, b, f, 0x6d9d6122, w[11], 16);
+ step(35, b, c, d, a, f, 0xfde5380c, w[14], 23);
+ step(36, a, b, c, d, f, 0xa4beea44, w[1], 4);
+ step(37, d, a, b, c, f, 0x4bdecfa9, w[4], 11);
+ step(38, c, d, a, b, f, 0xf6bb4b60, w[7], 16);
+ step(39, b, c, d, a, f, 0xbebfbc70, w[10], 23);
+ step(40, a, b, c, d, f, 0x289b7ec6, w[13], 4);
+ step(41, d, a, b, c, f, 0xeaa127fa, w[0], 11);
+ step(42, c, d, a, b, f, 0xd4ef3085, w[3], 16);
+ step(43, b, c, d, a, f, 0x04881d05, w[6], 23);
+ step(44, a, b, c, d, f, 0xd9d4d039, w[9], 4);
+ step(45, d, a, b, c, f, 0xe6db99e5, w[12], 11);
+ step(46, c, d, a, b, f, 0x1fa27cf8, w[15], 16);
+ step(47, b, c, d, a, f, 0xc4ac5665, w[2], 23);
+
+ step(48, a, b, c, d, f, 0xf4292244, w[0], 6);
+ step(49, d, a, b, c, f, 0x432aff97, w[7], 10);
+ step(50, c, d, a, b, f, 0xab9423a7, w[14], 15);
+ step(51, b, c, d, a, f, 0xfc93a039, w[5], 21);
+ step(52, a, b, c, d, f, 0x655b59c3, w[12], 6);
+ step(53, d, a, b, c, f, 0x8f0ccc92, w[3], 10);
+ step(54, c, d, a, b, f, 0xffeff47d, w[10], 15);
+ step(55, b, c, d, a, f, 0x85845dd1, w[1], 21);
+ step(56, a, b, c, d, f, 0x6fa87e4f, w[8], 6);
+ step(57, d, a, b, c, f, 0xfe2ce6e0, w[15], 10);
+ step(58, c, d, a, b, f, 0xa3014314, w[6], 15);
+ step(59, b, c, d, a, f, 0x4e0811a1, w[13], 21);
+ step(60, a, b, c, d, f, 0xf7537e82, w[4], 6);
+ step(61, d, a, b, c, f, 0xbd3af235, w[11], 10);
+ step(62, c, d, a, b, f, 0x2ad7d2bb, w[2], 15);
+ step(63, b, c, d, a, f, 0xeb86d391, w[9], 21);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+}
+
+static inline void hash_init_digest(MD5_WORD_T * digest)
+{
+ static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] =
+ { MD5_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver md5_ctx_mgr_init_base_slver_0000018f;
+struct slver md5_ctx_mgr_init_base_slver = { 0x018f, 0x00, 0x00 };
+
+struct slver md5_ctx_mgr_submit_base_slver_00000190;
+struct slver md5_ctx_mgr_submit_base_slver = { 0x0190, 0x00, 0x00 };
+
+struct slver md5_ctx_mgr_flush_base_slver_00000191;
+struct slver md5_ctx_mgr_flush_base_slver = { 0x0191, 0x00, 0x00 };
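
The F1/F2 macros in the base implementation above are the familiar reduced-operation rewrites of the textbook MD5 selection functions: d ^ (b & (c ^ d)) equals (b & c) | (~b & d), and c ^ (d & (b ^ c)) equals (d & b) | (~d & c). A small self-contained sketch, not part of the patch, that verifies both identities over a handful of values:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t vals[] = { 0x00000000u, 0xffffffffu, 0x12345678u, 0xdeadbeefu };
	for (int i = 0; i < 4; i++)
		for (int j = 0; j < 4; j++)
			for (int k = 0; k < 4; k++) {
				uint32_t b = vals[i], c = vals[j], d = vals[k];
				/* F1: choose c or d depending on each bit of b */
				assert((d ^ (b & (c ^ d))) == ((b & c) | (~b & d)));
				/* F2: choose b or c depending on each bit of d */
				assert((c ^ (d & (b ^ c))) == ((d & b) | (~d & c)));
			}
	return 0;
}
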
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base_aliases.c
new file mode 100644
index 000000000..42e29ab5a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base_aliases.c
@@ -0,0 +1,50 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <string.h>
+#include "md5_mb.h"
+extern void md5_ctx_mgr_init_base(MD5_HASH_CTX_MGR * mgr);
+extern MD5_HASH_CTX *md5_ctx_mgr_flush_base(MD5_HASH_CTX_MGR * mgr);
+extern MD5_HASH_CTX *md5_ctx_mgr_submit_base(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags);
+void md5_ctx_mgr_init(MD5_HASH_CTX_MGR * mgr)
+{
+ md5_ctx_mgr_init_base(mgr);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_flush(MD5_HASH_CTX_MGR * mgr)
+{
+ return md5_ctx_mgr_flush_base(mgr);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_submit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ return md5_ctx_mgr_submit_base(mgr, ctx, buffer, len, flags);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c
new file mode 100644
index 000000000..1e7e91916
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c
@@ -0,0 +1,249 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "md5_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+static inline void hash_init_digest(MD5_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len);
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx);
+
+void md5_ctx_mgr_init_sse(MD5_HASH_CTX_MGR * mgr)
+{
+ md5_mb_mgr_init_sse(&mgr->mgr);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_submit_sse(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return md5_ctx_mgr_resubmit(mgr, ctx);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_flush_sse(MD5_HASH_CTX_MGR * mgr)
+{
+ MD5_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_sse(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = md5_ctx_mgr_resubmit(mgr, ctx);
+
+ // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len);
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % MD5_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= MD5_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(MD5_WORD_T * digest)
+{
+ static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] =
+ { MD5_INITIAL_DIGEST };
+ //memcpy(digest, hash_initial_digest, sizeof(hash_initial_digest));
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (MD5_BLOCK_SIZE - 1));
+
+ // memset(&padblock[i], 0, MD5_BLOCK_SIZE);
+ memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ MD5_PADLENGTHFIELD_SIZE;
+
+ *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3);
+
+ return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver md5_ctx_mgr_init_sse_slver_00020180;
+struct slver md5_ctx_mgr_init_sse_slver = { 0x0180, 0x02, 0x00 };
+
+struct slver md5_ctx_mgr_submit_sse_slver_00020181;
+struct slver md5_ctx_mgr_submit_sse_slver = { 0x0181, 0x02, 0x00 };
+
+struct slver md5_ctx_mgr_flush_sse_slver_00020182;
+struct slver md5_ctx_mgr_flush_sse_slver = { 0x0182, 0x02, 0x00 };
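
The hash_pad() helper above packs the whole MD5 finalisation into two lines of arithmetic. A minimal standalone C sketch (illustration only; the constants 64 and 8 are the usual MD5_BLOCK_SIZE and MD5_PADLENGTHFIELD_SIZE values assumed from md5_mb.h, and pad_end is a hypothetical name) reproduces just the extra-block calculation for two message lengths:

    /* Sketch of the hash_pad() arithmetic above; not part of the patch. */
    #include <stdint.h>
    #include <stdio.h>

    #define BLOCK 64   /* MD5_BLOCK_SIZE (assumed)          */
    #define LENSZ 8    /* MD5_PADLENGTHFIELD_SIZE (assumed) */

    static uint32_t pad_end(uint64_t total_len)
    {
            uint32_t i = (uint32_t)(total_len & (BLOCK - 1)); /* bytes already in the last block */
            /* 0x80 goes at offset i, then zero fill, then the 8-byte bit count */
            i += ((BLOCK - 1) & (0 - (total_len + LENSZ + 1))) + 1 + LENSZ;
            return i;                                         /* always a multiple of 64 */
    }

    int main(void)
    {
            /* 100-byte message: 36 bytes spill into the pad buffer -> 1 extra block */
            printf("%u\n", pad_end(100) / BLOCK);   /* prints 1 */
            /* 60-byte message: no room for 0x80 plus the length field -> 2 extra blocks */
            printf("%u\n", pad_end(60) / BLOCK);    /* prints 2 */
            return 0;
    }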
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm
new file mode 100644
index 000000000..7719946f0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm
@@ -0,0 +1,55 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define STS_UNKNOWN 0
+%define STS_BEING_PROCESSED 1
+%define STS_COMPLETED 2
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define JOB_MD5 structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; JOB_MD5
+
+;;; name size align
+FIELD _buffer, 8, 8 ; pointer to buffer
+FIELD	_len,	4,	4	; length in blocks
+FIELD _result_digest, 4*4, 64 ; Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+END_FIELDS
+
+%assign _JOB_MD5_size _FIELD_OFFSET
+%assign _JOB_MD5_align _STRUCT_ALIGN
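
For readers coming from the C side, the FIELD entries above correspond roughly to the layout below. This is a sketch only; the authoritative definition is the MD5 job structure in md5_mb.h, and the struct name here is illustrative.

    /* Rough C view of the JOB_MD5 layout described by the FIELD macros above. */
    #include <stdint.h>

    struct job_md5_sketch {
            uint8_t  *buffer;       /* _buffer: data to hash                          */
            uint32_t  len;          /* _len: length in blocks (see md5_ctx_sse.c)     */
            uint32_t  result_digest[4]
                    __attribute__((aligned(64)));  /* _result_digest (GCC/Clang syntax) */
            uint32_t  status;       /* _status: STS_UNKNOWN / BEING_PROCESSED / COMPLETED */
            void     *user_data;    /* _user_data                                     */
    };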
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm
new file mode 100644
index 000000000..6caad6733
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm
@@ -0,0 +1,73 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define MD5 Out Of Order Data Structures
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; LANE_DATA
+;;; name size align
+FIELD _job_in_lane, 8, 8 ; pointer to job object
+END_FIELDS
+
+%assign _LANE_DATA_size _FIELD_OFFSET
+%assign _LANE_DATA_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; MD5_ARGS_X32
+;;; name size align
+FIELD _digest, 4*4*32, 16 ; transposed digest
+FIELD _data_ptr, 8*32, 8 ; array of pointers to data
+END_FIELDS
+
+%assign _MD5_ARGS_X8_size _FIELD_OFFSET
+%assign _MD5_ARGS_X8_align _STRUCT_ALIGN
+%assign _MD5_ARGS_X16_size _FIELD_OFFSET
+%assign _MD5_ARGS_X16_align _STRUCT_ALIGN
+%assign _MD5_ARGS_X32_size _FIELD_OFFSET
+%assign _MD5_ARGS_X32_align _STRUCT_ALIGN
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; MB_MGR
+;;; name size align
+FIELD _args, _MD5_ARGS_X8_size, _MD5_ARGS_X8_align
+FIELD _lens, 4*32, 8
+FIELD _unused_lanes, 8*4, 8
+FIELD _ldata, _LANE_DATA_size*32, _LANE_DATA_align
+FIELD _num_lanes_inuse, 4, 4
+END_FIELDS
+
+%assign _MB_MGR_size _FIELD_OFFSET
+%assign _MB_MGR_align _STRUCT_ALIGN
+
+_args_digest equ _args + _digest
+_args_data_ptr equ _args + _data_ptr
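
The _digest area above is stored transposed (word-major rather than lane-major), which is what lets the SIMD kernels load the same digest word from many lanes with a single vector load. A C-style sketch of the indexing for the 32-lane X32 layout (illustration only):

    /* Word-major digest indexing: offset = 4*lane + word*4*LANES bytes. */
    #include <stdint.h>

    #define MD5_WORDS 4
    #define LANES     32

    static inline uint32_t *digest_word(uint32_t digest[MD5_WORDS][LANES],
                                        int lane, int word)
    {
            return &digest[word][lane];
    }

This is why the flush routines below read the four digest words of lane idx at a stride of 4*lanes bytes rather than contiguously.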
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm
new file mode 100644
index 000000000..b74646de4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm
@@ -0,0 +1,248 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x4x2_avx
+
+[bits 64]
+default rel
+section .text
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; UN*X register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x4x2_avx
+%define idx r8
+
+%define unused_lanes r9
+
+%define lane_data r10
+
+%define job_rax rax
+%define tmp rax
+
+%endif ;; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; JOB* md5_mb_mgr_flush_avx(MB_MGR_HMAC_OOO *state)
+; arg 1 : rcx : state
+mk_global md5_mb_mgr_flush_avx, function
+md5_mb_mgr_flush_avx:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ ; if bit (32+3) is set, then all lanes are empty
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 32+3
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+ cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [four]
+ cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [five]
+ cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [six]
+ cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [seven]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x4x2_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*32]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
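
In scalar terms, the flush path above does three things: park every empty lane on a live lane's buffer with an effectively infinite length, take the unsigned minimum of the packed length words, and run the kernel for that many blocks so that at least the minimum-length lane completes and can be retired at len_is_0. A rough C equivalent of the lane selection (illustration only, hypothetical names):

    /* Scalar sketch of copy_lane_data plus the vpminud reduction above.
     * lens[] entries are packed as (blocks << 4) | lane. */
    #include <stddef.h>
    #include <stdint.h>

    #define LANES 8

    static uint32_t flush_pick_lane(void *job_in_lane[LANES],
                                    const void *data_ptr[LANES],
                                    uint32_t lens[LANES], uint32_t live_idx)
    {
            uint32_t i, min = 0xFFFFFFFF;

            for (i = 0; i < LANES; i++) {
                    if (job_in_lane[i] == NULL) {   /* empty lane                */
                            data_ptr[i] = data_ptr[live_idx];
                            lens[i] = 0xFFFFFFFF;   /* never selected as minimum */
                    }
                    if (lens[i] < min)
                            min = lens[i];
            }
            return min & 0xF;   /* lane index; remaining blocks = min >> 4 */
    }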
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm
new file mode 100644
index 000000000..910d5af89
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm
@@ -0,0 +1,255 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x8x2_avx2
+
+[bits 64]
+default rel
+section .text
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; UN*X register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x8x2_avx2
+%define idx rbp
+
+%define unused_lanes r9
+
+%define lane_data r10
+
+%define job_rax rax
+%define tmp rax
+
+%define num_lanes_inuse r8
+
+%endif ;; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; JOB* md5_mb_mgr_flush_avx2(MB_MGR_HMAC_OOO *state)
+; arg 1 : rcx : state
+mk_global md5_mb_mgr_flush_avx2, function
+md5_mb_mgr_flush_avx2:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {D,C,B,A}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,D,C}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,E,F}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x,x,E}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has halves of ymm2 reversed
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x8x2_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*64]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
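
The AVX2 manager above handles sixteen lanes but uses the same packing of block count and lane index into one 32-bit length word, so a single unsigned minimum finds both at once; the AVX512 variant below simply widens the lane field to six bits (and idx, 0x3F / shr len2, 6). Illustrative helpers, not part of the sources:

    /* Length/lane packing used by the lens[] arrays in these managers. */
    #include <stdint.h>

    static inline uint32_t pack_len(uint32_t blocks, uint32_t lane)
    {
            return (blocks << 4) | (lane & 0xF);
    }

    static inline void unpack_len(uint32_t v, uint32_t *blocks, uint32_t *lane)
    {
            *lane   = v & 0xF;      /* "and idx, 0xF"  */
            *blocks = v >> 4;       /* "shr len2, 4"   */
    }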
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx512.asm
new file mode 100644
index 000000000..a0eaf428a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx512.asm
@@ -0,0 +1,315 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+extern md5_mb_x16x2_avx512
+
+[bits 64]
+default rel
+section .text
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; UN*X register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x16x2_avx512
+%define idx rbp
+
+%define unused_lanes ymm7
+%define lane r9
+
+%define lane_data r10
+
+%define job_rax rax
+%define tmp rax
+
+%define num_lanes_inuse r8
+
+%endif ;; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+;; Byte shift in MEM addr, reads an extra byte at [addr+32]
+%macro MEM_VPSRLDDQ 2
+%define %%addr %1
+%define %%TMP_YMM %2
+ vmovdqu %%TMP_YMM, [%%addr + 1]
+ vmovdqu [%%addr], %%TMP_YMM
+ mov [%%addr + 31], byte 0
+%endmacro
+
+;; Byte shift in MEM addr, reads an extra byte at [addr-1]
+%macro MEM_VPSLLDDQ 2
+%define %%addr %1
+%define %%TMP_YMM %2
+ vmovdqu %%TMP_YMM, [%%addr-1]
+ vmovdqu [%%addr], %%TMP_YMM
+ mov [%%addr], byte 0
+%endmacro
+
+align 64
+
+; JOB* md5_mb_mgr_flush_avx512(MB_MGR_HMAC_OOO *state)
+; arg 1 : rcx : state
+mk_global md5_mb_mgr_flush_avx512, function
+md5_mb_mgr_flush_avx512:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 31
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 32
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {D,C,B,A}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,D,C}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,E,F}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x,x,E}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has halves of ymm2 reversed
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+ ; Find min length
+ vmovdqu ymm5, [state + _lens + 2*32]
+ vmovdqu ymm6, [state + _lens + 3*32]
+
+ vpminud ymm4, ymm5, ymm6 ; ymm4 has {D,C,B,A}
+ vpalignr ymm3, ymm3, ymm4, 8 ; ymm3 has {x,x,D,C}
+ vpminud ymm4, ymm4, ymm3 ; ymm4 has {x,x,E,F}
+ vpalignr ymm3, ymm3, ymm4, 4 ; ymm3 has {x,x,x,E}
+ vpminud ymm4, ymm4, ymm3 ; ymm4 has min value in low dword
+ vperm2i128 ymm3, ymm4, ymm4, 1 ; ymm3 has halves of ymm4 reversed
+ vpminud ymm4, ymm4, ymm3 ; ymm4 has min value in low dword
+
+ vpminud ymm2, ymm2, ymm4 ; ymm2 has min value in low dword
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0x3F
+ shr len2, 6
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_6bits]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+ vpsubd ymm5, ymm5, ymm2
+ vpsubd ymm6, ymm6, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+ vmovdqu [state + _lens + 2*32], ymm5
+ vmovdqu [state + _lens + 3*32], ymm6
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x16x2_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov lane, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+
+ shl lane, 8
+ or lane, idx
+ MEM_VPSLLDDQ (state + _unused_lanes), unused_lanes
+ mov [state + _unused_lanes], lane
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16*2]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16*2], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16*2], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16*2], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_6bits:
+ dq 0x00000000FFFFFFC0, 0x0000000000000000
+ dq 0x00000000FFFFFFC0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+lane_16: dq 16
+lane_17: dq 17
+lane_18: dq 18
+lane_19: dq 19
+lane_20: dq 20
+lane_21: dq 21
+lane_22: dq 22
+lane_23: dq 23
+lane_24: dq 24
+lane_25: dq 25
+lane_26: dq 26
+lane_27: dq 27
+lane_28: dq 28
+lane_29: dq 29
+lane_30: dq 30
+lane_31: dq 31
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_md5_mb_mgr_flush_avx512
+no_md5_mb_mgr_flush_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
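
Unlike the nibble-packed free list used by the SSE/AVX/AVX2 managers, the AVX512 manager keeps its 32 free lane numbers as one byte each (four uint64_t words, initialised to 0x00..0x1f by md5_mb_mgr_init_avx512 below), and the MEM_VPSLLDDQ/MEM_VPSRLDDQ macros shift that 32-byte queue in memory by one entry. A scalar sketch of the queue discipline those macros plus the "shl lane, 8 / or lane, idx" sequence implement (illustration only):

    /* Byte-per-lane free queue; q[0] is the next lane to hand out. */
    #include <stdint.h>
    #include <string.h>

    static void push_free_lane(uint8_t q[32], uint8_t idx)
    {
            memmove(q + 1, q, 31);  /* MEM_VPSLLDDQ: shift the queue up one byte */
            q[0] = idx;             /* freed lane becomes the next one used      */
    }

    static uint8_t pop_free_lane(uint8_t q[32])
    {
            uint8_t idx = q[0];
            memmove(q, q + 1, 31);  /* MEM_VPSRLDDQ equivalent */
            q[31] = 0;
            return idx;
    }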
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm
new file mode 100644
index 000000000..d3aa25f86
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm
@@ -0,0 +1,249 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x4x2_sse
+
+[bits 64]
+default rel
+section .text
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; UN*X register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x4x2_sse
+%define idx r8
+
+%define unused_lanes r9
+
+%define lane_data r10
+
+%define job_rax rax
+%define tmp rax
+
+%endif ;; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; JOB* md5_mb_mgr_flush_sse(MB_MGR_HMAC_OOO *state)
+; arg 1 : rcx : state
+mk_global md5_mb_mgr_flush_sse, function
+md5_mb_mgr_flush_sse:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ movdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ ; if bit (32+3) is set, then all lanes are empty
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 32+3
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+ cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [four]
+ cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [five]
+ cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [six]
+ cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [seven]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ movdqa xmm0, [state + _lens + 0*16]
+ movdqa xmm1, [state + _lens + 1*16]
+
+ movdqa xmm2, xmm0
+ pminud xmm2, xmm1 ; xmm2 has {D,C,B,A}
+ palignr xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ pminud xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ palignr xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ pminud xmm2, xmm3 ; xmm2 has min value in low dword
+
+ movd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ pand xmm2, [rel clear_low_nibble]
+ pshufd xmm2, xmm2, 0
+
+ psubd xmm0, xmm2
+ psubd xmm1, xmm2
+
+ movdqa [state + _lens + 0*16], xmm0
+ movdqa [state + _lens + 1*16], xmm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x4x2_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*32]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
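
The SSE flush is the same algorithm as the AVX version above; the only mechanical difference is that destructive two-operand SSE instructions force the extra "movdqa xmm2, xmm0" copy before pminud. For reference, the same min reduction written with SSE4.1 intrinsics (a sketch, not part of the sources):

    #include <smmintrin.h>

    static unsigned min_encoded_len(__m128i lens_lo, __m128i lens_hi)
    {
            __m128i m = _mm_min_epu32(lens_lo, lens_hi);
            m = _mm_min_epu32(m, _mm_srli_si128(m, 8)); /* fold upper half      */
            m = _mm_min_epu32(m, _mm_srli_si128(m, 4)); /* fold remaining dword */
            return (unsigned)_mm_cvtsi128_si32(m);      /* blocks<<4 | lane     */
    }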
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c
new file mode 100644
index 000000000..f41e5efbc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "md5_mb.h"
+
+void md5_mb_mgr_init_avx2(MD5_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes[0] = 0xfedcba9876543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < 16; j++) {
+ state->lens[j] = 0xFFFFFFFF;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
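
The constant 0xfedcba9876543210 above packs the sixteen free lane numbers one nibble each, with the next free lane in the low nibble; the AVX2 flush shown earlier and the submit routine manipulate it exactly like this pair of helpers (illustration only):

    #include <stdint.h>

    static unsigned pop_lane16(uint64_t *unused_lanes)
    {
            unsigned lane = (unsigned)(*unused_lanes & 0xF); /* "and lane, 0xF"       */
            *unused_lanes >>= 4;                             /* "shr unused_lanes, 4" */
            return lane;
    }

    static void push_lane16(uint64_t *unused_lanes, unsigned lane)
    {
            *unused_lanes = (*unused_lanes << 4) | (lane & 0xF); /* "shl ... / or ..., idx" */
    }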
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c
new file mode 100644
index 000000000..5ff02aa76
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c
@@ -0,0 +1,44 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "md5_mb.h"
+
+void md5_mb_mgr_init_avx512(MD5_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes[0] = 0x0706050403020100;
+ state->unused_lanes[1] = 0x0f0e0d0c0b0a0908;
+ state->unused_lanes[2] = 0x1716151413121110;
+ state->unused_lanes[3] = 0x1f1e1d1c1b1a1918;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < 32; j++) {
+ state->lens[j] = 0xFFFFFFFF;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c
new file mode 100644
index 000000000..615cd9d76
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "md5_mb.h"
+
+void md5_mb_mgr_init_sse(MD5_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes[0] = 0xF76543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < 8; j++) {
+ state->lens[j] = 0xFFFFFFFF;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
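
Here the eight-lane queue gets an extra 0xF sentinel nibble on top of lanes 0..7. Once every lane has been pushed back, the sentinel sits in nibble 8 again, which is what the "bt unused_lanes, 32+3" test in md5_mb_mgr_flush_sse/_avx above relies on. A one-line sketch of that test (illustration only):

    #include <stdbool.h>
    #include <stdint.h>

    static bool all_lanes_free(uint64_t unused_lanes)
    {
            return (unused_lanes >> (32 + 3)) & 1;  /* sentinel back in nibble 8 */
    }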
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm
new file mode 100644
index 000000000..96adcf614
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm
@@ -0,0 +1,228 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+extern md5_mb_x4x2_avx
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%else
+; UN*X register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x4x2_avx
+%define idx r8
+
+%define p r9
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane r10
+
+%define lane_data r11
+
+%endif ; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* submit_job(MB_MGR *state, JOB_MD5 *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global md5_mb_mgr_submit_avx, function
+md5_mb_mgr_submit_avx:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ ; we need to save/restore all GPRs because lower layer clobbers them
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*1], rbp
+ mov [rsp + 8*2], r12
+ mov [rsp + 8*3], r13
+ mov [rsp + 8*4], r14
+ mov [rsp + 8*5], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*6], rsi
+ mov [rsp + 8*7], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ vmovd [state + _args_digest + 4*lane + 0*32], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*32], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*32], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*32], xmm0, 3
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x4x2_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*32]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*6]
+ mov rdi, [rsp + 8*7]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*1]
+ mov r12, [rsp + 8*2]
+ mov r13, [rsp + 8*3]
+ mov r14, [rsp + 8*4]
+ mov r15, [rsp + 8*5]
+
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
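
The submit path above only runs the hash kernel once every lane is occupied; until then a job is simply parked in a free lane and NULL is returned. A compact C sketch of that control flow (illustration only, hypothetical names):

    #include <stdint.h>

    /* Returns 1 when the manager is full and the asm falls through to
     * start_loop, which runs md5_mb_x4x2_avx for the minimum block count
     * and retires the lane that finished, exactly like the flush path. */
    static int submit_sketch(uint64_t *unused_lanes, uint32_t lens[8],
                             uint32_t job_blocks)
    {
            unsigned lane = (unsigned)(*unused_lanes & 0xF); /* pop a free lane   */
            *unused_lanes >>= 4;
            lens[lane] = (job_blocks << 4) | lane;           /* pack blocks+lane  */

            /* Only the 0xF sentinel left means all eight lanes are occupied
             * ("cmp unused_lanes, 0xF / jne return_null" above). */
            return (*unused_lanes == 0xF);
    }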
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm
new file mode 100644
index 000000000..ed9b0588e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm
@@ -0,0 +1,239 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x8x2_avx2
+
+[bits 64]
+default rel
+section .text
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define lane rsi
+
+%else
+; UN*X register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+%define lane rdx
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx needs to be in a register not clobbered by md5_mb_x8x2_avx2
+%define idx rbp
+
+%define p r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define num_lanes_inuse r9
+
+%define lane_data r10
+
+%endif ; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8 (the call pushed 8 bytes of return
+; address, so this keeps rsp 16-byte aligned for the vmovdqa saves at rsp + 8*8)
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* submit_job(MB_MGR *state, JOB_MD5 *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global md5_mb_mgr_submit_avx2, function
+md5_mb_mgr_submit_avx2:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ ; we need to save/restore all GPRs because lower layer clobbers them
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*1], rbp
+ mov [rsp + 8*2], r12
+ mov [rsp + 8*3], r13
+ mov [rsp + 8*4], r14
+ mov [rsp + 8*5], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*6], rsi
+ mov [rsp + 8*7], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ vmovd [state + _args_digest + 4*lane + 0*64], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*64], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*64], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*64], xmm0, 3
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ add num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ cmp num_lanes_inuse, 16
+ jne return_null
+
+start_loop:
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {D,C,B,A}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,D,C}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,E,F}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x,x,E}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has halves of ymm2 reversed
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x8x2_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*64]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*6]
+ mov rdi, [rsp + 8*7]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*1]
+ mov r12, [rsp + 8*2]
+ mov r13, [rsp + 8*3]
+ mov r14, [rsp + 8*4]
+ mov r15, [rsp + 8*5]
+
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=32
+
+align 32
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
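Once the shortest packed entry is found, its lane-index nibble is cleared (the clear_low_nibble mask), the pure length is broadcast and subtracted from every lane, and because the subtrahend is never larger than any lane's length field, each entry keeps its lane index in the low four bits. A scalar restatement of that update step (hypothetical names; 16 lanes as in the x8x2 AVX2 core):

#include <stdint.h>

#define NUM_LANES 16			/* md5_mb_x8x2_avx2 runs 8x2 = 16 lanes */

/* Subtract the just-scheduled block count from every lane; mirrors the
 * vpand with clear_low_nibble, the vpshufd broadcast and the two vpsubd. */
static void subtract_min_len(uint32_t lens[NUM_LANES], uint32_t packed_min)
{
	uint32_t len_only = packed_min & 0xFFFFFFF0u;	/* drop the lane index */
	for (int i = 0; i < NUM_LANES; i++)
		lens[i] -= len_only;
}

int main(void)
{
	uint32_t lens[NUM_LANES];
	for (uint32_t lane = 0; lane < NUM_LANES; lane++)
		lens[lane] = ((lane + 5) << 4) | lane;	/* lane 0 is the minimum */

	subtract_min_len(lens, lens[0]);
	return lens[0] == 0 ? 0 : 1;	/* lane 0 now has zero blocks left */
}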
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm
new file mode 100644
index 000000000..1bbc2be2c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm
@@ -0,0 +1,283 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+extern md5_mb_x16x2_avx512
+
+[bits 64]
+default rel
+section .text
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define lane rsi
+
+%else
+; UN*X register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+%define lane rdx
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx needs to be in a register not clobbered by md5_mb_x16x2_avx512
+%define idx rbp
+
+%define p r11
+
+%define unused_lanes ymm7
+
+%define job_rax rax
+%define len rax
+
+%define num_lanes_inuse r9
+
+%define lane_data r10
+
+%endif ; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+;; Byte shift in MEM addr, reads an extra byte at [addr+32]
+%macro MEM_VPSRLDDQ 2
+%define %%addr %1
+%define %%TMP_YMM %2
+ vmovdqu %%TMP_YMM, [%%addr + 1]
+ vmovdqu [%%addr], %%TMP_YMM
+ mov [%%addr + 31], byte 0
+%endmacro
+
+;; Byte shift in MEM addr, reads an extra byte at [addr-1]
+%macro MEM_VPSLLDDQ 2
+%define %%addr %1
+%define %%TMP_YMM %2
+ vmovdqu %%TMP_YMM, [%%addr-1]
+ vmovdqu [%%addr], %%TMP_YMM
+ mov [%%addr], byte 0
+%endmacro
+
+align 64
+
+; JOB* submit_job(MB_MGR *state, JOB_MD5 *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global md5_mb_mgr_submit_avx512, function
+md5_mb_mgr_submit_avx512:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ ; we need to save/restore all GPRs because lower layer clobbers them
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*1], rbp
+ mov [rsp + 8*2], r12
+ mov [rsp + 8*3], r13
+ mov [rsp + 8*4], r14
+ mov [rsp + 8*5], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*6], rsi
+ mov [rsp + 8*7], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+ mov lane, [state + _unused_lanes]
+ and lane, 0x3F
+ MEM_VPSRLDDQ (state + _unused_lanes), unused_lanes
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov DWORD(len), [job + _len]
+
+	shl	len, 6	; low 6 bits store idx
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ vmovd [state + _args_digest + 4*lane + 0*4*16*2], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*4*16*2], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*4*16*2], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*4*16*2], xmm0, 3
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ add num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ cmp num_lanes_inuse, 32
+ jne return_null
+
+start_loop:
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {D,C,B,A}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,D,C}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,E,F}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x,x,E}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has halves of ymm2 reversed
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ ; Find min length
+ vmovdqu ymm5, [state + _lens + 2*32]
+ vmovdqu ymm6, [state + _lens + 3*32]
+
+ vpminud ymm4, ymm5, ymm6 ; ymm4 has {D,C,B,A}
+ vpalignr ymm3, ymm3, ymm4, 8 ; ymm3 has {x,x,D,C}
+ vpminud ymm4, ymm4, ymm3 ; ymm4 has {x,x,E,F}
+ vpalignr ymm3, ymm3, ymm4, 4 ; ymm3 has {x,x,x,E}
+ vpminud ymm4, ymm4, ymm3 ; ymm4 has min value in low dword
+ vperm2i128 ymm3, ymm4, ymm4, 1 ; ymm3 has halves of ymm4 reversed
+ vpminud ymm4, ymm4, ymm3 ; ymm4 has min value in low dword
+
+ vpminud ymm2, ymm2, ymm4 ; ymm2 has min value in low dword
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0x3F
+ shr len2, 6
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_6bits]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+ vpsubd ymm5, ymm5, ymm2
+ vpsubd ymm6, ymm6, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+ vmovdqu [state + _lens + 2*32], ymm5
+ vmovdqu [state + _lens + 3*32], ymm6
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x16x2_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov lane, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+
+ shl lane, 8
+ or lane, idx
+ MEM_VPSLLDDQ (state + _unused_lanes), unused_lanes
+ mov [state + _unused_lanes], lane
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16*2]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16*2], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16*2], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16*2], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*6]
+ mov rdi, [rsp + 8*7]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*1]
+ mov r12, [rsp + 8*2]
+ mov r13, [rsp + 8*3]
+ mov r14, [rsp + 8*4]
+ mov r15, [rsp + 8*5]
+
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=32
+
+align 32
+clear_low_6bits:
+ dq 0x00000000FFFFFFC0, 0x0000000000000000
+ dq 0x00000000FFFFFFC0, 0x0000000000000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_md5_mb_mgr_submit_avx512
+no_md5_mb_mgr_submit_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
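With 32 lanes the free-lane list no longer fits in a 64-bit word of 4-bit indices, so this manager keeps _unused_lanes as a byte queue: MEM_VPSRLDDQ pops the front index by sliding the whole field down one byte, and the completion path pushes a freed index back at the front (MEM_VPSLLDDQ plus the shl/or and the 8-byte store). A scalar model of the net effect, assuming a 32-byte field and illustrative helper names:

#include <stdint.h>
#include <string.h>

#define QUEUE_BYTES 32			/* one index byte per lane */

/* Net effect of MEM_VPSRLDDQ on the submit path: pop the front lane index. */
static uint8_t lane_queue_pop(uint8_t q[QUEUE_BYTES])
{
	uint8_t lane = q[0];
	memmove(q, q + 1, QUEUE_BYTES - 1);	/* the asm uses one unaligned ymm move */
	q[QUEUE_BYTES - 1] = 0;
	return lane;
}

/* Net effect of MEM_VPSLLDDQ + shl/or/store: push a freed index at the front. */
static void lane_queue_push(uint8_t q[QUEUE_BYTES], uint8_t lane)
{
	memmove(q + 1, q, QUEUE_BYTES - 1);
	q[0] = lane;
}

int main(void)
{
	uint8_t q[QUEUE_BYTES];
	for (uint8_t i = 0; i < QUEUE_BYTES; i++)
		q[i] = i;			/* all 32 lanes start free */
	uint8_t lane = lane_queue_pop(q);	/* claim lane 0 */
	lane_queue_push(q, lane);		/* hand it back */
	return q[0];				/* 0 again */
}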
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm
new file mode 100644
index 000000000..2a374c7e3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm
@@ -0,0 +1,229 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x4x2_sse
+
+[bits 64]
+default rel
+section .text
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%else
+; UN*X register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x4x2_sse
+%define idx r8
+
+%define p r9
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane r10
+
+%define lane_data r11
+
+%endif ; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* submit_job(MB_MGR *state, JOB_MD5 *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global md5_mb_mgr_submit_sse, function
+md5_mb_mgr_submit_sse:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ ; we need to save/restore all GPRs because lower layer clobbers them
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*1], rbp
+ mov [rsp + 8*2], r12
+ mov [rsp + 8*3], r13
+ mov [rsp + 8*4], r14
+ mov [rsp + 8*5], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*6], rsi
+ mov [rsp + 8*7], rdi
+ movdqa [rsp + 8*8 + 16*0], xmm6
+ movdqa [rsp + 8*8 + 16*1], xmm7
+ movdqa [rsp + 8*8 + 16*2], xmm8
+ movdqa [rsp + 8*8 + 16*3], xmm9
+ movdqa [rsp + 8*8 + 16*4], xmm10
+ movdqa [rsp + 8*8 + 16*5], xmm11
+ movdqa [rsp + 8*8 + 16*6], xmm12
+ movdqa [rsp + 8*8 + 16*7], xmm13
+ movdqa [rsp + 8*8 + 16*8], xmm14
+ movdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ movdqu xmm0, [job + _result_digest + 0*16]
+ movd [state + _args_digest + 4*lane + 0*32], xmm0
+ pextrd [state + _args_digest + 4*lane + 1*32], xmm0, 1
+ pextrd [state + _args_digest + 4*lane + 2*32], xmm0, 2
+ pextrd [state + _args_digest + 4*lane + 3*32], xmm0, 3
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ movdqa xmm0, [state + _lens + 0*16]
+ movdqa xmm1, [state + _lens + 1*16]
+
+ movdqa xmm2, xmm0
+ pminud xmm2, xmm1 ; xmm2 has {D,C,B,A}
+ palignr xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ pminud xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ palignr xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ pminud xmm2, xmm3 ; xmm2 has min value in low dword
+
+ movd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ pand xmm2, [rel clear_low_nibble]
+ pshufd xmm2, xmm2, 0
+
+ psubd xmm0, xmm2
+ psubd xmm1, xmm2
+
+ movdqa [state + _lens + 0*16], xmm0
+ movdqa [state + _lens + 1*16], xmm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x4x2_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*32]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + 8*8 + 16*0]
+ movdqa xmm7, [rsp + 8*8 + 16*1]
+ movdqa xmm8, [rsp + 8*8 + 16*2]
+ movdqa xmm9, [rsp + 8*8 + 16*3]
+ movdqa xmm10, [rsp + 8*8 + 16*4]
+ movdqa xmm11, [rsp + 8*8 + 16*5]
+ movdqa xmm12, [rsp + 8*8 + 16*6]
+ movdqa xmm13, [rsp + 8*8 + 16*7]
+ movdqa xmm14, [rsp + 8*8 + 16*8]
+ movdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*6]
+ mov rdi, [rsp + 8*7]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*1]
+ mov r12, [rsp + 8*2]
+ mov r13, [rsp + 8*3]
+ mov r14, [rsp + 8*4]
+ mov r15, [rsp + 8*5]
+
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
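All four submit variants (SSE, AVX, AVX2, AVX-512) follow the same control flow: claim a lane from unused_lanes, record the job pointer, buffer pointer and packed length, and return NULL until every lane is occupied; only a full manager enters the min-length loop and calls the wide core. A compact C sketch of that gate, with hypothetical stand-ins for the real MB_MGR layout:

#include <stddef.h>
#include <stdint.h>

#define LANES 8				/* 8 for SSE/AVX, 16 for AVX2, 32 for AVX-512 */

struct md5_job { const uint8_t *buffer; uint32_t len; /* length in blocks */ };

struct mb_mgr {
	struct md5_job *lane_job[LANES];
	uint32_t lens[LANES];		/* (blocks << 4) | lane */
	uint64_t unused_lanes;		/* packed 4-bit free-lane indices */
	uint32_t num_lanes_inuse;
};

/* Returns NULL while lanes are still filling; the real code then finds the
 * minimum length, runs the multi-lane core and returns the finished job. */
static struct md5_job *submit_sketch(struct mb_mgr *m, struct md5_job *job)
{
	uint32_t lane = m->unused_lanes & 0xF;
	m->unused_lanes >>= 4;
	m->lane_job[lane] = job;
	m->lens[lane] = (job->len << 4) | lane;
	m->num_lanes_inuse++;

	if (m->num_lanes_inuse < LANES)
		return NULL;		/* keep buffering jobs */

	/* ...min-length search, core call and completion handling elided... */
	return job;			/* placeholder for the job that finished */
}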
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c
new file mode 100644
index 000000000..bba868f1a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c
@@ -0,0 +1,159 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/md5.h>
+#include "md5_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 200
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * MD5_DIGEST_NWORDS];
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ MD5_HASH_CTX_MGR *mgr = NULL;
+ MD5_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ int ret;
+
+ printf("multibinary_md5 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ md5_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // SSL test
+ MD5(bufs[i], TEST_LEN, digest_ssl[i]);
+
+ // sb_md5 test
+ md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (md5_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ md5_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Random buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run SSL test
+ MD5(bufs[i], lens[i], digest_ssl[i]);
+
+ // Run sb_md5 test
+ md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (md5_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_md5_ssl rand: Pass\n");
+
+ return fail;
+}
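The digest comparison above hinges on byte order: OpenSSL's MD5() writes the digest as 16 bytes, while the multi-buffer job keeps the four 32-bit MD5 state words, and those words correspond to the bytes read little-endian, which is what to_le32() on a uint32_t view of digest_ssl achieves. A byte-explicit equivalent (hypothetical helper name) would be:

#include <stdint.h>

/* Word j (0..3) of an MD5 digest given as 16 raw bytes, read little-endian. */
static uint32_t md5_digest_word(const uint8_t d[16], int j)
{
	return (uint32_t)d[4 * j] |
	       ((uint32_t)d[4 * j + 1] << 8) |
	       ((uint32_t)d[4 * j + 2] << 16) |
	       ((uint32_t)d[4 * j + 3] << 24);
}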
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c
new file mode 100644
index 000000000..d19246138
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c
@@ -0,0 +1,202 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "md5_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][MD5_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void md5_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ MD5_HASH_CTX_MGR *mgr = NULL;
+ MD5_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ uint8_t *tmp_buf;
+ int ret;
+
+ printf("multibinary_md5 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ md5_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ md5_ref(bufs[i], digest_ref[i], TEST_LEN);
+
+ // Run sb_md5 test
+ md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (md5_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%08X <=> 0x%08X \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ md5_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Use buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run reference test
+ md5_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // Run md5_mb test
+ md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (md5_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail "
+ "0x%08X <=> 0x%08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ // Test at the end of buffer
+ jobs = rand() % TEST_BUFS;
+ tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs);
+ if (!tmp_buf) {
+ printf("malloc failed, end test aborted.\n");
+ return 1;
+ }
+
+ rand_buffer(tmp_buf, jobs);
+
+ md5_ctx_mgr_init(mgr);
+
+ // Extend to the end of allocated buffer to construct jobs
+ for (i = 0; i < jobs; i++) {
+ bufs[i] = (uint8_t *) & tmp_buf[i];
+ lens[i] = jobs - i;
+
+ // Reference test
+ md5_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // sb_md5 test
+ md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (md5_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("End test failed at offset %d - result: 0x%08X"
+ ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ putchar('.');
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_md5 rand: Pass\n");
+
+ return fail;
+}
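The "test at the end of buffer" pass above lays the jobs out so that each one finishes exactly at the last byte of a small allocation; any read past a job's stated length then lands outside the buffer, where tools such as valgrind or ASan would catch it. The layout reduces to this restatement of the loop above (no new functionality):

#include <stdint.h>

/* Job i starts at offset i of a 'jobs'-byte allocation and runs to its end,
 * so bufs[i] + lens[i] == tmp_buf + jobs for every i. */
static void layout_tail_jobs(uint8_t *tmp_buf, uint32_t jobs,
			     uint8_t *bufs[], uint32_t lens[])
{
	for (uint32_t i = 0; i < jobs; i++) {
		bufs[i] = &tmp_buf[i];
		lens[i] = jobs - i;
	}
}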
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_update_test.c
new file mode 100644
index 000000000..2eab61dfa
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_update_test.c
@@ -0,0 +1,297 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "md5_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define UPDATE_SIZE 13*MD5_BLOCK_SIZE
+#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*MD5_BLOCK_SIZE))
+
+#ifdef DEBUG
+# define debug_char(x) putchar(x)
+#else
+# define debug_char(x) do {} while (0)
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint32_t digest_ref[TEST_BUFS][MD5_DIGEST_NWORDS];
+
+extern void md5_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ MD5_HASH_CTX_MGR *mgr = NULL;
+ MD5_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, fail = 0;
+ int len_done, len_rem, len_rand;
+ unsigned char *bufs[TEST_BUFS];
+ unsigned char *buf_ptr[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int joblen, jobs, t;
+ int ret;
+
+ printf("multibinary_md5_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ md5_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ buf_ptr[i] = bufs[i];
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ md5_ref(bufs[i], digest_ref[i], TEST_LEN);
+ }
+
+ // Run sb_md5 tests
+ for (i = 0; i < TEST_BUFS;) {
+ len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_done == 0)
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_FIRST);
+ else if (len_rem <= UPDATE_SIZE)
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ // Add jobs while available or finished
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = md5_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = md5_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+
+ len_done = (int)((unsigned long)buf_ptr[i]
+ - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_rem <= UPDATE_SIZE)
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = md5_ctx_mgr_flush(mgr);
+ }
+
+ // Check digests
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d fail %8X <=> %8X",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ for (i = 0; i < jobs; i++) {
+ joblen = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], joblen);
+ lens[i] = joblen;
+ buf_ptr[i] = bufs[i];
+ md5_ref(bufs[i], digest_ref[i], lens[i]);
+ }
+
+ md5_ctx_mgr_init(mgr);
+
+ // Run md5_sb jobs
+ i = 0;
+ while (i < jobs) {
+ // Submit a new job
+ len_rand = MD5_BLOCK_SIZE +
+ MD5_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS);
+
+ if (lens[i] > len_rand)
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_FIRST);
+ else
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], lens[i], HASH_ENTIRE);
+
+ // Returned ctx could be:
+ // - null context (we are just getting started and lanes aren't full yet), or
+ // - finished already (an ENTIRE we submitted or a previous LAST is returned), or
+ // - an unfinished ctx, we will resubmit
+
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ } else {
+ // unfinished ctx returned, choose another random update length and submit either
+ // UPDATE or LAST depending on the amount of buffer remaining
+ while ((ctx != NULL) && !(hash_ctx_complete(ctx))) {
+ j = (unsigned long)(ctx->user_data); // Get index of the returned ctx
+ buf_ptr[j] = bufs[j] + ctx->total_length;
+ len_rand = (rand() % MD5_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ len_rem = lens[j] - ctx->total_length;
+
+ if (len_rem <= len_rand) // submit the rest of the job as LAST
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rem, HASH_LAST);
+ else // submit the random update length as UPDATE
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rand,
+ HASH_UPDATE);
+ } // Either continue submitting any contexts returned here as UPDATE/LAST, or
+ // go back to submitting new jobs using the index i.
+
+ i++;
+ }
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = md5_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = md5_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer
+ len_rem = lens[i] - ctx->total_length;
+ len_rand = (rand() % MD5_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ debug_char('+');
+ if (len_rem <= len_rand)
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = md5_ctx_mgr_flush(mgr);
+ }
+
+ // Check result digest
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail %8X <=> %8X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_md5_update rand: Pass\n");
+
+ return fail;
+}
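The resubmit logic exercised above boils down to a simple streaming pattern: open with HASH_FIRST, keep feeding whichever context the manager hands back with HASH_UPDATE, finish the tail with HASH_LAST, and flush whenever nothing is returned. A minimal single-stream sketch using the same API as the test (it assumes CHUNK is a multiple of MD5_BLOCK_SIZE, as HASH_FIRST/HASH_UPDATE require):

#include "md5_mb.h"

#define CHUNK (8 * MD5_BLOCK_SIZE)

static void hash_one_stream(MD5_HASH_CTX_MGR *mgr, MD5_HASH_CTX *ctx,
			    unsigned char *buf, uint32_t len)
{
	hash_ctx_init(ctx);
	MD5_HASH_CTX *r = md5_ctx_mgr_submit(mgr, ctx, buf,
					     len <= CHUNK ? len : CHUNK,
					     len <= CHUNK ? HASH_ENTIRE : HASH_FIRST);

	while (!(r && hash_ctx_complete(r))) {
		if (r == NULL) {		/* nothing returned yet: force progress */
			r = md5_ctx_mgr_flush(mgr);
			continue;
		}
		/* r came back unfinished: feed it the next chunk. */
		uint32_t done = (uint32_t)r->total_length;
		uint32_t rem = len - done;
		r = md5_ctx_mgr_submit(mgr, r, buf + done,
				       rem <= CHUNK ? rem : CHUNK,
				       rem <= CHUNK ? HASH_LAST : HASH_UPDATE);
	}
}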
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c
new file mode 100644
index 000000000..4f84b6723
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c
@@ -0,0 +1,229 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "md5_mb.h"
+
+typedef uint32_t DigestMD5[MD5_DIGEST_NWORDS];
+
+#define MSGS 13
+#define NUM_JOBS 1000
+
+#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS
+
+static uint8_t msg1[] = "Test vector from febooti.com";
+static uint8_t msg2[] = "12345678901234567890" "12345678901234567890"
+ "12345678901234567890" "12345678901234567890";
+static uint8_t msg3[] = "";
+static uint8_t msg4[] = "abcdefghijklmnopqrstuvwxyz";
+static uint8_t msg5[] = "message digest";
+static uint8_t msg6[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz0123456789";
+static uint8_t msg7[] = "abc";
+static uint8_t msg8[] = "a";
+
+static uint8_t msg9[] = "";
+static uint8_t msgA[] = "abcdefghijklmnopqrstuvwxyz";
+static uint8_t msgB[] = "message digest";
+static uint8_t msgC[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz0123456789";
+static uint8_t msgD[] = "abc";
+
+static DigestMD5 expResultDigest1 = { 0x61b60a50, 0xfbb76d3c, 0xf5620cd3, 0x0f3d57ff };
+static DigestMD5 expResultDigest2 = { 0xa2f4ed57, 0x55c9e32b, 0x2eda49ac, 0x7ab60721 };
+static DigestMD5 expResultDigest3 = { 0xd98c1dd4, 0x04b2008f, 0x980980e9, 0x7e42f8ec };
+static DigestMD5 expResultDigest4 = { 0xd7d3fcc3, 0x00e49261, 0x6c49fb7d, 0x3be167ca };
+static DigestMD5 expResultDigest5 = { 0x7d696bf9, 0x8d93b77c, 0x312f5a52, 0xd061f1aa };
+static DigestMD5 expResultDigest6 = { 0x98ab74d1, 0xf5d977d2, 0x2c1c61a5, 0x9f9d419f };
+static DigestMD5 expResultDigest7 = { 0x98500190, 0xb04fd23c, 0x7d3f96d6, 0x727fe128 };
+static DigestMD5 expResultDigest8 = { 0xb975c10c, 0xa8b6f1c0, 0xe299c331, 0x61267769 };
+
+static DigestMD5 expResultDigest9 = { 0xd98c1dd4, 0x04b2008f, 0x980980e9, 0x7e42f8ec };
+static DigestMD5 expResultDigestA = { 0xd7d3fcc3, 0x00e49261, 0x6c49fb7d, 0x3be167ca };
+static DigestMD5 expResultDigestB = { 0x7d696bf9, 0x8d93b77c, 0x312f5a52, 0xd061f1aa };
+static DigestMD5 expResultDigestC = { 0x98ab74d1, 0xf5d977d2, 0x2c1c61a5, 0x9f9d419f };
+static DigestMD5 expResultDigestD = { 0x98500190, 0xb04fd23c, 0x7d3f96d6, 0x727fe128 };
+
+static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7, msg8, msg9,
+ msgA, msgB, msgC, msgD
+};
+
+static uint32_t *expResultDigest[MSGS] = {
+ expResultDigest1, expResultDigest2, expResultDigest3,
+ expResultDigest4, expResultDigest5, expResultDigest6,
+ expResultDigest7, expResultDigest8, expResultDigest9,
+ expResultDigestA, expResultDigestB, expResultDigestC,
+ expResultDigestD
+};
+
+int main(void)
+{
+ MD5_HASH_CTX_MGR *mgr = NULL;
+ MD5_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+ uint32_t i, j, k, t, checked = 0;
+ uint32_t *good;
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ md5_ctx_mgr_init(mgr);
+
+ // Init contexts before first use
+ for (i = 0; i < MSGS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ for (i = 0; i < MSGS; i++) {
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i], msgs[i],
+ strlen((char *)msgs[i]), HASH_ENTIRE);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+
+ }
+ }
+
+ while (1) {
+ ctx = md5_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ // do larger test in pseudo-random order
+
+ // Init contexts before first use
+ for (i = 0; i < NUM_JOBS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ checked = 0;
+ for (i = 0; i < NUM_JOBS; i++) {
+ j = PSEUDO_RANDOM_NUM(i);
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ }
+ }
+ while (1) {
+ ctx = md5_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ if (checked != NUM_JOBS) {
+ printf("only tested %d rather than %d\n", checked, NUM_JOBS);
+ return -1;
+ }
+
+ printf(" multibinary_md5 test: Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c
new file mode 100644
index 000000000..0ba50a1d2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c
@@ -0,0 +1,129 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/md5.h>
+#include "md5_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS 32
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 10000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * MD5_DIGEST_NWORDS];
+
+int main(void)
+{
+ int ret;
+ MD5_HASH_CTX_MGR *mgr = NULL;
+ MD5_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("calloc failed test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ md5_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ MD5(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("md5_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+
+ while (md5_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_md5" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+
+ printf("Multi-buffer md5 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_md5_ossl_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm
new file mode 100644
index 000000000..718572638
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm
@@ -0,0 +1,853 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+[bits 64]
+default rel
+section .text
+
+
+;; code to compute double 16-lane MD5 using AVX512
+
+;; Stack must be aligned to 64 bytes before call
+
+;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp
+;;
+;; clobbers zmm0-8, 14-31
+
+;; clobbers all GPRs other than arg1 and rbp
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx ; arg0
+ %define arg2 rdx ; arg1
+ %define reg3 r8 ; arg2
+ %define reg4 r9 ; arg3
+ %define var1 rdi
+ %define var2 rsi
+ %define local_func_decl(func_name) global func_name
+ %else
+ %define arg1 rdi ; arg0
+ %define arg2 rsi ; arg1
+ %define var1 rdx ; arg2
+ %define var2 rcx ; arg3
+ %define local_func_decl(func_name) mk_global func_name, function, internal
+%endif
+
+%define state arg1
+%define num_blks arg2
+
+%define IN (state + _data_ptr)
+%define DIGEST state
+%define SIZE num_blks
+;; These are pointers to data block1 and block2 in the stack
+; which will ping pong back and forth
+%define DPTR1 rbx
+%define DPTR2 var2
+%define IDX var1
+%define TBL rax
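+;; DPTR1 points at the block currently being hashed; DPTR2 receives the next
+;; block as it is transposed, and the two pointers are exchanged at the end of
+;; each pass through .lloop.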
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+%define inp4 r12
+%define inp5 r13
+%define inp6 r14
+%define inp7 r15
+
+;; Transposed Digest Storage
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define A1 zmm4
+%define B1 zmm5
+%define C1 zmm6
+%define D1 zmm7
+
+%define md5c zmm16
+
+%define MASK0 zmm17
+%define MASK1 zmm18
+
+%define TMP0 zmm20
+%define TMP1 zmm21
+
+
+;; Data are stored into the Wx after transposition
+%define W0 zmm8
+%define W1 zmm9
+%define W2 zmm10
+%define W3 zmm11
+%define W4 zmm12
+%define W5 zmm13
+%define W6 zmm14
+%define W7 zmm15
+
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+%define MD5_DIGEST_ROW_SIZE (16*4)
+%define APPEND(a,b) a %+ b
+%define APPEND3(a,b,c) a %+ b %+ c
+
+;; Temporary registers used during data transposition
+
+%define RESZ resb 64*
+;; Assume stack aligned to 64 bytes before call
+;; Therefore FRAMESIZE mod 64 must be 64-8 = 56
+struc STACK
+_DATA: RESZ 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs
+_DIGEST: RESZ 8 ; stores Z_AA-Z_DD, Z_AA2-Z_DD2
+_TMPDIGEST: RESZ 2 ; stores Z_AA, Z_BB temporarily
+_RSP_SAVE: RESQ 1 ; original RSP
+endstruc
+
+%define Z_AA rsp + _DIGEST + 64*0
+%define Z_BB rsp + _DIGEST + 64*1
+%define Z_CC rsp + _DIGEST + 64*2
+%define Z_DD rsp + _DIGEST + 64*3
+%define Z_AA1 rsp + _DIGEST + 64*4
+%define Z_BB1 rsp + _DIGEST + 64*5
+%define Z_CC1 rsp + _DIGEST + 64*6
+%define Z_DD1 rsp + _DIGEST + 64*7
+
+%define MD5_DIGEST_ROW_SIZE (32*4)
+
+
+;;
+;; MD5 left rotations (number of bits)
+;;
+%define rot11 7
+%define rot12 12
+%define rot13 17
+%define rot14 22
+%define rot21 5
+%define rot22 9
+%define rot23 14
+%define rot24 20
+%define rot31 4
+%define rot32 11
+%define rot33 16
+%define rot34 23
+%define rot41 6
+%define rot42 10
+%define rot43 15
+%define rot44 21
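+;; rotXY is the rotation amount used in round X at position Y of each group of
+;; four steps (the s constants from RFC 1321).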
+
+%macro TRANSPOSE16 18
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%r8 %9
+%define %%r9 %10
+%define %%r10 %11
+%define %%r11 %12
+%define %%r12 %13
+%define %%r13 %14
+%define %%r14 %15
+%define %%r15 %16
+%define %%t0 %17
+%define %%t1 %18
+
+; r0 = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0}
+; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0}
+; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0}
+; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0}
+; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0}
+; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0}
+; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0}
+; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0}
+; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0}
+
+; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
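+; i.e. a 16x16 dword transpose: on output, register rj holds dword j from each
+; of the sixteen lanes a..p.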
+
+
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2}
+
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2}
+
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0}
+
+ ; use r6 in place of t0
+ vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0}
+ vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2}
+ vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0}
+ vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2}
+
+	vshufps	%%r11, %%r6, %%t1, 0xDD	; r11 = {l13 k13 j13 i13 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1}
+	vshufps	%%r9, %%r8, %%r10, 0x88	; r9 = {l14 k14 j14 i14 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2}
+	vshufps	%%r8, %%r8, %%r10, 0xDD	; r8 = {l15 k15 j15 i15 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3}
+	vshufps	%%r6, %%r6, %%t1, 0x88	; r6 = {l12 k12 j12 i12 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0}
+
+ ; use r10 in place of t0
+	vshufps	%%r10, %%r12, %%r13, 0x44	; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 m1 m0}
+	vshufps	%%r12, %%r12, %%r13, 0xEE	; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 m3 m2}
+	vshufps	%%t1, %%r14, %%r15, 0x44	; t1 = {p13 p12 o13 o12 p9 p8 o9 o8 p5 p4 o5 o4 p1 p0 o1 o0}
+	vshufps	%%r14, %%r14, %%r15, 0xEE	; r14 = {p15 p14 o15 o14 p11 p10 o11 o10 p7 p6 o7 o6 p3 p2 o3 o2}
+
+	vshufps	%%r15, %%r10, %%t1, 0xDD	; r15 = {p13 o13 n13 m13 p9 o9 n9 m9 p5 o5 n5 m5 p1 o1 n1 m1}
+	vshufps	%%r13, %%r12, %%r14, 0x88	; r13 = {p14 o14 n14 m14 p10 o10 n10 m10 p6 o6 n6 m6 p2 o2 n2 m2}
+	vshufps	%%r12, %%r12, %%r14, 0xDD	; r12 = {p15 o15 n15 m15 p11 o11 n11 m11 p7 o7 n7 m7 p3 o3 n3 m3}
+	vshufps	%%r10, %%r10, %%t1, 0x88	; r10 = {p12 o12 n12 m12 p8 o8 n8 m8 p4 o4 n4 m4 p0 o0 n0 m0}
+
+;; At this point, the registers that contain interesting data are:
+;; t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12
+;; Can use t1 and r14 as scratch registers
+
+ vmovdqa32 %%r14, MASK0
+ vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0}
+ vmovdqa32 %%t1, MASK1
+ vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vmovdqa32 %%r2, MASK0
+ vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1}
+ vmovdqa32 %%t0, MASK1
+ vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vmovdqa32 %%r3, MASK0
+ vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r7, MASK1
+ vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vmovdqa32 %%r1, MASK0
+ vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3}
+ vmovdqa32 %%r5, MASK1
+ vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vmovdqa32 %%r0, MASK0
+ vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0}
+ vmovdqa32 %%r4, MASK1
+ vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4}
+
+ vmovdqa32 %%r6, MASK0
+ vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1}
+ vmovdqa32 %%r10, MASK1
+ vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5}
+
+ vmovdqa32 %%r11, MASK0
+ vpermi2q %%r11, %%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2}
+ vmovdqa32 %%r15, MASK1
+ vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6}
+
+ vmovdqa32 %%r9, MASK0
+ vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3}
+ vmovdqa32 %%r13, MASK1
+ vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7}
+
+;; At this point r8 and r12 can be used as scratch registers
+
+ vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+ vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+
+ vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+ vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+ vshuff64x2 %%t1, %%r7, %%r15, 0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+ vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+ vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+ vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+
+ vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+ vshuff64x2 %%r3, %%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+
+ vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+ vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+
+ vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+%endmacro
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro ROTATE_ARGS1 0
+%xdefine TMP_ D1
+%xdefine D1 C1
+%xdefine C1 B1
+%xdefine B1 A1
+%xdefine A1 TMP_
+%endm
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +Ft(B,C,D) +data +const), nrot)
+;;eg: PROCESS_LOOP MD5constx, Mdatax, F_IMMEDx, NROTx
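+;; Each invocation performs this step for 32 lanes at once: 16 lanes in A..D
+;; and 16 lanes in A1..D1.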
+%macro PROCESS_LOOP 6
+%define %%MD5const %1
+%define %%data %2
+%define %%F_IMMED %3
+%define %%NROT %4
+%define %%TMP_PR0 %5
+%define %%TMP_PR1 %6
+ ; a=b+((a+Ft(b,c,d)+Mj+ti)<<s)
+
+ ; Ft
+ ; 0-15 Ft:F(X,Y,Z)=(X&Y)|((~X)&Z) 0xca
+ ; 16-31 Ft:G(X,Y,Z)=(X&Z)|(Y&(~Z)) 0xe4
+ ; 32-47 Ft:H(X,Y,Z)=X^Y^Z 0x96
+ ; 48-63 Ft:I(X,Y,Z)=Y^(X|(~Z)) 0x39
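+	; vpternlogd evaluates the 3-input boolean function whose truth table is
+	; the immediate byte, so each Ft costs one instruction per set of 16 lanes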
+
+ vpaddd A, A, %%MD5const
+ vpaddd A1, A1, %%MD5const
+ vpaddd A, A, [%%data]
+ vpaddd A1, A1, [%%data + 16*64]
+ vmovdqa32 %%TMP_PR0, B ; Copy B
+	vmovdqa32 %%TMP_PR1, B1		; Copy B1
+ vpternlogd %%TMP_PR0, C, D, %%F_IMMED
+ vpternlogd %%TMP_PR1, C1, D1, %%F_IMMED
+ vpaddd A, A, %%TMP_PR0
+ vpaddd A1, A1, %%TMP_PR1
+ vprold A, A, %%NROT
+ vprold A1, A1, %%NROT
+ vpaddd A, A, B
+ vpaddd A1, A1, B1
+
+ ROTATE_ARGS
+ ROTATE_ARGS1
+%endmacro
+
+align 64
+
+; void md5_mb_x16x2_avx512(MD5_ARGS *args, UINT64 num_blks)
+; arg 1 : pointer to MD5_ARGS structure
+; arg 2 : number of blocks (>=1)
+
+local_func_decl(md5_mb_x16x2_avx512)
+md5_mb_x16x2_avx512:
+ endbranch
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -64
+ mov [rsp + _RSP_SAVE], rax
+
+ mov DPTR1, rsp
+ lea DPTR2, [rsp + 64*32]
+
+ ;; Load MD5 constant pointer to register
+ lea TBL, [MD5_TABLE]
+ vmovdqa32 MASK0, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vmovdqa32 MASK1, [PSHUFFLE_TRANSPOSE16_MASK2]
+
+	;; Preload input data from all 32 segments (two sets of 16 lanes).
+ xor IDX, IDX
+
+ ;; transpose input onto stack
+ ;; first 16 lanes read
+ mov inp0, [IN + 0*8]
+ mov inp1, [IN + 1*8]
+ mov inp2, [IN + 2*8]
+ mov inp3, [IN + 3*8]
+ mov inp4, [IN + 4*8]
+ mov inp5, [IN + 5*8]
+ mov inp6, [IN + 6*8]
+ mov inp7, [IN + 7*8]
+ vmovdqu32 W0,[inp0+IDX]
+ vmovdqu32 W1,[inp1+IDX]
+ vmovdqu32 W2,[inp2+IDX]
+ vmovdqu32 W3,[inp3+IDX]
+ vmovdqu32 W4,[inp4+IDX]
+ vmovdqu32 W5,[inp5+IDX]
+ vmovdqu32 W6,[inp6+IDX]
+ vmovdqu32 W7,[inp7+IDX]
+ mov inp0, [IN + 8*8]
+ mov inp1, [IN + 9*8]
+ mov inp2, [IN +10*8]
+ mov inp3, [IN +11*8]
+ mov inp4, [IN +12*8]
+ mov inp5, [IN +13*8]
+ mov inp6, [IN +14*8]
+ mov inp7, [IN +15*8]
+ vmovdqu32 W8, [inp0+IDX]
+ vmovdqu32 W9, [inp1+IDX]
+ vmovdqu32 W10,[inp2+IDX]
+ vmovdqu32 W11,[inp3+IDX]
+ vmovdqu32 W12,[inp4+IDX]
+ vmovdqu32 W13,[inp5+IDX]
+ vmovdqu32 W14,[inp6+IDX]
+ vmovdqu32 W15,[inp7+IDX]
+ ;; first 16 lanes trans&write
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+ vmovdqa32 [DPTR1+_DATA+(0)*64],W0
+ vmovdqa32 [DPTR1+_DATA+(1)*64],W1
+ vmovdqa32 [DPTR1+_DATA+(2)*64],W2
+ vmovdqa32 [DPTR1+_DATA+(3)*64],W3
+ vmovdqa32 [DPTR1+_DATA+(4)*64],W4
+ vmovdqa32 [DPTR1+_DATA+(5)*64],W5
+ vmovdqa32 [DPTR1+_DATA+(6)*64],W6
+ vmovdqa32 [DPTR1+_DATA+(7)*64],W7
+ vmovdqa32 [DPTR1+_DATA+(8)*64],W8
+ vmovdqa32 [DPTR1+_DATA+(9)*64],W9
+ vmovdqa32 [DPTR1+_DATA+(10)*64],W10
+ vmovdqa32 [DPTR1+_DATA+(11)*64],W11
+ vmovdqa32 [DPTR1+_DATA+(12)*64],W12
+ vmovdqa32 [DPTR1+_DATA+(13)*64],W13
+ vmovdqa32 [DPTR1+_DATA+(14)*64],W14
+ vmovdqa32 [DPTR1+_DATA+(15)*64],W15
+
+ ;; second 16 lanes read
+ mov inp0, [IN + 16*8]
+ mov inp1, [IN + 17*8]
+ mov inp2, [IN + 18*8]
+ mov inp3, [IN + 19*8]
+ mov inp4, [IN + 20*8]
+ mov inp5, [IN + 21*8]
+ mov inp6, [IN + 22*8]
+ mov inp7, [IN + 23*8]
+ vmovdqu32 W0,[inp0+IDX]
+ vmovdqu32 W1,[inp1+IDX]
+ vmovdqu32 W2,[inp2+IDX]
+ vmovdqu32 W3,[inp3+IDX]
+ vmovdqu32 W4,[inp4+IDX]
+ vmovdqu32 W5,[inp5+IDX]
+ vmovdqu32 W6,[inp6+IDX]
+ vmovdqu32 W7,[inp7+IDX]
+ mov inp0, [IN + 24*8]
+ mov inp1, [IN + 25*8]
+ mov inp2, [IN + 26*8]
+ mov inp3, [IN + 27*8]
+ mov inp4, [IN + 28*8]
+ mov inp5, [IN + 29*8]
+ mov inp6, [IN + 30*8]
+ mov inp7, [IN + 31*8]
+ vmovdqu32 W8, [inp0+IDX]
+ vmovdqu32 W9, [inp1+IDX]
+ vmovdqu32 W10,[inp2+IDX]
+ vmovdqu32 W11,[inp3+IDX]
+ vmovdqu32 W12,[inp4+IDX]
+ vmovdqu32 W13,[inp5+IDX]
+ vmovdqu32 W14,[inp6+IDX]
+ vmovdqu32 W15,[inp7+IDX]
+ ;; second 16 lanes trans&write
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+ vmovdqa32 [DPTR1+_DATA+(16+0)*64],W0
+ vmovdqa32 [DPTR1+_DATA+(16+1)*64],W1
+ vmovdqa32 [DPTR1+_DATA+(16+2)*64],W2
+ vmovdqa32 [DPTR1+_DATA+(16+3)*64],W3
+ vmovdqa32 [DPTR1+_DATA+(16+4)*64],W4
+ vmovdqa32 [DPTR1+_DATA+(16+5)*64],W5
+ vmovdqa32 [DPTR1+_DATA+(16+6)*64],W6
+ vmovdqa32 [DPTR1+_DATA+(16+7)*64],W7
+ vmovdqa32 [DPTR1+_DATA+(16+8)*64],W8
+ vmovdqa32 [DPTR1+_DATA+(16+9)*64],W9
+ vmovdqa32 [DPTR1+_DATA+(16+10)*64],W10
+ vmovdqa32 [DPTR1+_DATA+(16+11)*64],W11
+ vmovdqa32 [DPTR1+_DATA+(16+12)*64],W12
+ vmovdqa32 [DPTR1+_DATA+(16+13)*64],W13
+ vmovdqa32 [DPTR1+_DATA+(16+14)*64],W14
+ vmovdqa32 [DPTR1+_DATA+(16+15)*64],W15
+
+ ;; Initialize digests
+	;; vmovdqu32 is used in place of vmovdqa32
+ vmovdqu32 A, [DIGEST + 0 * MD5_DIGEST_ROW_SIZE]
+ vmovdqu32 B, [DIGEST + 1 * MD5_DIGEST_ROW_SIZE]
+ vmovdqu32 C, [DIGEST + 2 * MD5_DIGEST_ROW_SIZE]
+ vmovdqu32 D, [DIGEST + 3 * MD5_DIGEST_ROW_SIZE]
+	; Load the digest for the second set of 16 streams (17-32)
+ vmovdqu32 A1,[DIGEST + 0 * MD5_DIGEST_ROW_SIZE + 64]
+ vmovdqu32 B1,[DIGEST + 1 * MD5_DIGEST_ROW_SIZE + 64]
+ vmovdqu32 C1,[DIGEST + 2 * MD5_DIGEST_ROW_SIZE + 64]
+ vmovdqu32 D1,[DIGEST + 3 * MD5_DIGEST_ROW_SIZE + 64]
+
+.lloop:
+ ;; Increment IDX to point to next data block (64 bytes per block)
+ add IDX, 64
+
+ ; Save digests for later addition
+ vmovdqa32 [Z_AA], A
+ vmovdqa32 [Z_BB], B
+ vmovdqa32 [Z_CC], C
+ vmovdqa32 [Z_DD], D
+ vmovdqa32 [Z_AA1], A1
+ vmovdqa32 [Z_BB1], B1
+ vmovdqa32 [Z_CC1], C1
+ vmovdqa32 [Z_DD1], D1
+
+ sub SIZE, 1
+ je .LastLoop
+
+%assign I 0
+%assign I_fimm 0xCA
+%rep 16 ; 0<=I<=15
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ %assign I_data I
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
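+	;; While rounds 2-4 of the current block run below, the next block is read
+	;; and transposed into the DPTR2 buffer so the loads overlap with the
+	;; computation.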
+ ;; first 16 lanes read
+ mov inp0, [IN + 0*8]
+ mov inp1, [IN + 1*8]
+ mov inp2, [IN + 2*8]
+ mov inp3, [IN + 3*8]
+ mov inp4, [IN + 4*8]
+ mov inp5, [IN + 5*8]
+ mov inp6, [IN + 6*8]
+ mov inp7, [IN + 7*8]
+ vmovdqu32 W0,[inp0+IDX]
+ vmovdqu32 W1,[inp1+IDX]
+ vmovdqu32 W2,[inp2+IDX]
+ vmovdqu32 W3,[inp3+IDX]
+ vmovdqu32 W4,[inp4+IDX]
+ vmovdqu32 W5,[inp5+IDX]
+ vmovdqu32 W6,[inp6+IDX]
+ vmovdqu32 W7,[inp7+IDX]
+ mov inp0, [IN + 8*8]
+ mov inp1, [IN + 9*8]
+ mov inp2, [IN +10*8]
+ mov inp3, [IN +11*8]
+ mov inp4, [IN +12*8]
+ mov inp5, [IN +13*8]
+ mov inp6, [IN +14*8]
+ mov inp7, [IN +15*8]
+ vmovdqu32 W8, [inp0+IDX]
+ vmovdqu32 W9, [inp1+IDX]
+ vmovdqu32 W10,[inp2+IDX]
+ vmovdqu32 W11,[inp3+IDX]
+ vmovdqu32 W12,[inp4+IDX]
+ vmovdqu32 W13,[inp5+IDX]
+ vmovdqu32 W14,[inp6+IDX]
+ vmovdqu32 W15,[inp7+IDX]
+
+%assign I 16
+%assign I_fimm 0xE4
+%rep 16 ; 16<=I<=31
+ %assign I_data ((5*I+1) % 16)
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+ ;; first 16 lanes trans&write
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+ vmovdqa32 [DPTR2+_DATA+(0)*64],W0
+ vmovdqa32 [DPTR2+_DATA+(1)*64],W1
+ vmovdqa32 [DPTR2+_DATA+(2)*64],W2
+ vmovdqa32 [DPTR2+_DATA+(3)*64],W3
+ vmovdqa32 [DPTR2+_DATA+(4)*64],W4
+ vmovdqa32 [DPTR2+_DATA+(5)*64],W5
+ vmovdqa32 [DPTR2+_DATA+(6)*64],W6
+ vmovdqa32 [DPTR2+_DATA+(7)*64],W7
+ vmovdqa32 [DPTR2+_DATA+(8)*64],W8
+ vmovdqa32 [DPTR2+_DATA+(9)*64],W9
+ vmovdqa32 [DPTR2+_DATA+(10)*64],W10
+ vmovdqa32 [DPTR2+_DATA+(11)*64],W11
+ vmovdqa32 [DPTR2+_DATA+(12)*64],W12
+ vmovdqa32 [DPTR2+_DATA+(13)*64],W13
+ vmovdqa32 [DPTR2+_DATA+(14)*64],W14
+ vmovdqa32 [DPTR2+_DATA+(15)*64],W15
+
+%assign I 32
+%assign I_fimm 0x96
+%rep 16 ; 32<=I<=47
+ %assign I_data ((3*I+5) % 16)
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+ ;; second 16 lanes read
+ mov inp0, [IN + 16*8]
+ mov inp1, [IN + 17*8]
+ mov inp2, [IN + 18*8]
+ mov inp3, [IN + 19*8]
+ mov inp4, [IN + 20*8]
+ mov inp5, [IN + 21*8]
+ mov inp6, [IN + 22*8]
+ mov inp7, [IN + 23*8]
+ vmovdqu32 W0,[inp0+IDX]
+ vmovdqu32 W1,[inp1+IDX]
+ vmovdqu32 W2,[inp2+IDX]
+ vmovdqu32 W3,[inp3+IDX]
+ vmovdqu32 W4,[inp4+IDX]
+ vmovdqu32 W5,[inp5+IDX]
+ vmovdqu32 W6,[inp6+IDX]
+ vmovdqu32 W7,[inp7+IDX]
+ mov inp0, [IN + 24*8]
+ mov inp1, [IN + 25*8]
+ mov inp2, [IN + 26*8]
+ mov inp3, [IN + 27*8]
+ mov inp4, [IN + 28*8]
+ mov inp5, [IN + 29*8]
+ mov inp6, [IN + 30*8]
+ mov inp7, [IN + 31*8]
+ vmovdqu32 W8, [inp0+IDX]
+ vmovdqu32 W9, [inp1+IDX]
+ vmovdqu32 W10,[inp2+IDX]
+ vmovdqu32 W11,[inp3+IDX]
+ vmovdqu32 W12,[inp4+IDX]
+ vmovdqu32 W13,[inp5+IDX]
+ vmovdqu32 W14,[inp6+IDX]
+ vmovdqu32 W15,[inp7+IDX]
+
+%assign I 48
+%assign I_fimm 0x39
+%rep 16 ; 48<=I<=63
+ %assign I_rotX (I/16+1)
+ %assign I_rotY (I % 4 + 1)
+ %assign I_data ((7*I) % 16)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+ ;; second 16 lanes trans&write
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+ vmovdqa32 [DPTR2+_DATA+(16+0)*64],W0
+ vmovdqa32 [DPTR2+_DATA+(16+1)*64],W1
+ vmovdqa32 [DPTR2+_DATA+(16+2)*64],W2
+ vmovdqa32 [DPTR2+_DATA+(16+3)*64],W3
+ vmovdqa32 [DPTR2+_DATA+(16+4)*64],W4
+ vmovdqa32 [DPTR2+_DATA+(16+5)*64],W5
+ vmovdqa32 [DPTR2+_DATA+(16+6)*64],W6
+ vmovdqa32 [DPTR2+_DATA+(16+7)*64],W7
+ vmovdqa32 [DPTR2+_DATA+(16+8)*64],W8
+ vmovdqa32 [DPTR2+_DATA+(16+9)*64],W9
+ vmovdqa32 [DPTR2+_DATA+(16+10)*64],W10
+ vmovdqa32 [DPTR2+_DATA+(16+11)*64],W11
+ vmovdqa32 [DPTR2+_DATA+(16+12)*64],W12
+ vmovdqa32 [DPTR2+_DATA+(16+13)*64],W13
+ vmovdqa32 [DPTR2+_DATA+(16+14)*64],W14
+ vmovdqa32 [DPTR2+_DATA+(16+15)*64],W15
+
+ ; Add old digest
+ vpaddd A,A,[Z_AA]
+ vpaddd B,B,[Z_BB]
+ vpaddd C,C,[Z_CC]
+ vpaddd D,D,[Z_DD]
+ vpaddd A1,A1,[Z_AA1]
+ vpaddd B1,B1,[Z_BB1]
+ vpaddd C1,C1,[Z_CC1]
+ vpaddd D1,D1,[Z_DD1]
+
+ ; Swap DPTR1 and DPTR2
+ xchg DPTR1, DPTR2
+ ;; Proceed to processing of next block
+ jmp .lloop
+
+.LastLoop:
+%assign I 0
+%assign I_fimm 0xCA
+%rep 16 ; 0<=I<=15
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ %assign I_data I
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+%assign I 16
+%assign I_fimm 0xE4
+%rep 16 ; 16<=I<=31
+ %assign I_data ((5*I+1) % 16)
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+%assign I 32
+%assign I_fimm 0x96
+%rep 16 ; 32<=I<=47
+ %assign I_data ((3*I+5) % 16)
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+%assign I 48
+%assign I_fimm 0x39
+%rep 16 ; 48<=I<=63
+ %assign I_rotX (I/16+1)
+ %assign I_rotY (I % 4 + 1)
+ %assign I_data ((7*I) % 16)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A,A,[Z_AA]
+ vpaddd B,B,[Z_BB]
+ vpaddd C,C,[Z_CC]
+ vpaddd D,D,[Z_DD]
+ vpaddd A1,A1,[Z_AA1]
+ vpaddd B1,B1,[Z_BB1]
+ vpaddd C1,C1,[Z_CC1]
+ vpaddd D1,D1,[Z_DD1]
+
+	;; advance the 32 lane data pointers past the processed data
+%assign I 0
+%rep 16
+ mov inp0, [IN + (2*I)*8]
+ mov inp1, [IN + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [IN + (2*I)*8], inp0
+ mov [IN + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+
+ vmovdqu32 [DIGEST + 0*MD5_DIGEST_ROW_SIZE ], A
+ vmovdqu32 [DIGEST + 1*MD5_DIGEST_ROW_SIZE ], B
+ vmovdqu32 [DIGEST + 2*MD5_DIGEST_ROW_SIZE ], C
+ vmovdqu32 [DIGEST + 3*MD5_DIGEST_ROW_SIZE ], D
+	; Store the digest for the second set of 16 streams (17-32)
+ vmovdqu32 [DIGEST + 0 * MD5_DIGEST_ROW_SIZE + 64], A1
+ vmovdqu32 [DIGEST + 1 * MD5_DIGEST_ROW_SIZE + 64], B1
+ vmovdqu32 [DIGEST + 2 * MD5_DIGEST_ROW_SIZE + 64], C1
+ vmovdqu32 [DIGEST + 3 * MD5_DIGEST_ROW_SIZE + 64], D1
+
+ mov rsp, [rsp + _RSP_SAVE]
+ ret
+
+section .data
+align 64
+MD5_TABLE:
+ dd 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee
+ dd 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501
+ dd 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be
+ dd 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821
+ dd 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa
+ dd 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8
+ dd 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed
+ dd 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a
+ dd 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c
+ dd 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70
+ dd 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05
+ dd 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665
+ dd 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039
+ dd 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1
+ dd 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1
+ dd 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
+
+PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000
+ dq 0x0000000000000001
+ dq 0x0000000000000008
+ dq 0x0000000000000009
+ dq 0x0000000000000004
+ dq 0x0000000000000005
+ dq 0x000000000000000C
+ dq 0x000000000000000D
+
+PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002
+ dq 0x0000000000000003
+ dq 0x000000000000000A
+ dq 0x000000000000000B
+ dq 0x0000000000000006
+ dq 0x0000000000000007
+ dq 0x000000000000000E
+ dq 0x000000000000000F
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_md5_mb_x16x2_avx512
+no_md5_mb_x16x2_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm
new file mode 100644
index 000000000..afca137bd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm
@@ -0,0 +1,783 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+; clobbers all XMM registers
+; clobbers all GPRs except arg1 and r8
+
+;; code to compute octal MD5 using AVX
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+ vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+
+;;
+;; Magic functions defined in RFC 1321
+;;
+; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z))))
+%macro MAGIC_F 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ ;movdqa %%F,%%Z
+ vpxor %%F,%%Z, %%Y
+ vpand %%F,%%F,%%X
+ vpxor %%F,%%F,%%Z
+%endmacro
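+; MAGIC_F computes the same value as the RFC 1321 form (X & Y) | (~X & Z),
+; rewritten to avoid a separate NOT.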
+
+; macro MAGIC_G F,X,Y,Z ;; G(X,Y,Z) = MAGIC_F(Z,X,Y) = ((X) & (Z)) | ((Y) & ~(Z))
+%macro MAGIC_G 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ MAGIC_F %%F,%%Z,%%X,%%Y
+%endmacro
+
+; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z))
+%macro MAGIC_H 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ ;movdqa %%F,%%Z
+ vpxor %%F,%%Z, %%Y
+ vpxor %%F,%%F, %%X
+%endmacro
+
+; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z)))
+%macro MAGIC_I 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ vpcmpeqd %%F,%%F,%%F ; 0xFFFF
+ vpxor %%F,%%F,%%Z ; pnot %%Z
+ vpor %%F,%%F,%%X
+ vpxor %%F,%%F,%%Y
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ ;movdqa %%tmp, %%reg
+ vpsrld %%tmp, %%reg, (32-%%imm)
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, data, MD5const, nrot
+%macro MD5_STEP1 14
+%define %%MAGIC_FUN %1
+%define %%A %2
+%define %%B %3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6
+%define %%B2 %7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%data %12
+%define %%MD5const %13
+%define %%nrot %14
+
+ vpaddd %%A, %%A, %%MD5const
+ vpaddd %%A2, %%A2, %%MD5const
+ vpaddd %%A, %%A, [%%data]
+ vpaddd %%A2, %%A2, [%%data + 16*16]
+ %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+ vpaddd %%A, %%A, %%FUN
+ %%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2
+ vpaddd %%A2, %%A2, %%FUN
+ PROLD %%A,%%nrot, %%TMP
+ PROLD %%A2,%%nrot, %%TMP
+ vpaddd %%A, %%A, %%B
+ vpaddd %%A2, %%A2, %%B2
+%endmacro
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, FUN2, TMP2, data,
+; MD5const, nrot
+%macro MD5_STEP 16
+%define %%MAGIC_FUN %1
+%define %%A %2
+%define %%B %3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6
+%define %%B2 %7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%FUN2 %12
+%define %%TMP2 %13
+%define %%data %14
+%define %%MD5const %15
+%define %%nrot %16
+
+ vmovdqa %%TMP,[%%data]
+ vmovdqa %%TMP2,[%%data + 16*16]
+ vpaddd %%A, %%A, %%MD5const
+ vpaddd %%A2, %%A2, %%MD5const
+ vpaddd %%A, %%A, %%TMP
+ vpaddd %%A2, %%A2, %%TMP2
+ %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+ %%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2
+ vpaddd %%A, %%A, %%FUN
+ vpaddd %%A2, %%A2, %%FUN2
+ PROLD %%A,%%nrot, %%TMP
+ PROLD %%A2,%%nrot, %%TMP2
+ vpaddd %%A, %%A, %%B
+ vpaddd %%A2, %%A2, %%B2
+%endmacro
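+;; MD5_STEP differs from MD5_STEP1 in that it preloads the message words into
+;; TMP/TMP2 and gives the second lane set its own scratch registers
+;; (FUN2/TMP2); it is only used on the last block, where those registers
+;; (aliased to T0/T1) are not needed for transposing a following block.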
+
+;;
+;; MD5 left rotations (number of bits)
+;;
+rot11 equ 7
+rot12 equ 12
+rot13 equ 17
+rot14 equ 22
+rot21 equ 5
+rot22 equ 9
+rot23 equ 14
+rot24 equ 20
+rot31 equ 4
+rot32 equ 11
+rot33 equ 16
+rot34 equ 23
+rot41 equ 6
+rot42 equ 10
+rot43 equ 15
+rot44 equ 21
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4 ; tmp
+%define F xmm5 ; tmp
+
+%define A2 xmm6
+%define B2 xmm7
+%define C2 xmm8
+%define D2 xmm9
+
+
+%define FUN E
+%define TMP F
+%define FUN2 xmm10
+%define TMP2 xmm11
+
+%define T0 xmm10
+%define T1 xmm11
+%define T2 xmm12
+%define T3 xmm13
+%define T4 xmm14
+%define T5 xmm15
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;; Linux Registers
+%define arg1 rdi
+%define arg2 rsi
+%define inp7 rcx
+%define mem1 rdx
+%else
+;; Windows Registers
+%define arg1 rcx
+%define arg2 rdx
+%define inp7 rdi
+%define mem1 rsi
+%endif
+; r8 is not used
+
+; Common definitions
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define TBL rax
+%define IDX rbx
+%define mem2 rbp
+
+
+
+
+
+; Stack Layout
+;
+; 470 DD2
+; 460 CC2
+; 450 BB2
+; 440 AA2
+; 430 DD
+; 420 CC
+; 410 BB
+; 400 AA
+;
+; 3F0 data2[15] for lanes 7...4 \
+; ... \
+; 300 data2[0] for lanes 7...4 \
+; 2F0 data2[15] for lanes 3...0 > mem block 2
+; ... /
+; 210 data2[1] for lanes 3...0 /
+; 200 data2[0] for lanes 3...0 /
+;
+; 1F0 data1[15] for lanes 7...4 \
+; ... \
+; 100 data1[0] for lanes 7...4 \
+; F0 data1[15] for lanes 3...0 > mem block 1
+; ... /
+; 10 data1[1] for lanes 3...0 /
+; 0 data1[0] for lanes 3...0 /
+
+MEM equ 16*16*2*2 ; two blocks of data stored in stack
+; STACK_SIZE must be an odd multiple of 8 bytes in size
+STACK_SIZE equ MEM + 16*8 + 8
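+; (an odd multiple of 8 leaves rsp 16-byte aligned after the sub, since the
+; call pushed 8 bytes, so the aligned vmovdqa accesses to the stack are valid)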
+
+%define AA rsp + MEM + 16*0
+%define BB rsp + MEM + 16*1
+%define CC rsp + MEM + 16*2
+%define DD rsp + MEM + 16*3
+%define AA2 rsp + MEM + 16*4
+%define BB2 rsp + MEM + 16*5
+%define CC2 rsp + MEM + 16*6
+%define DD2 rsp + MEM + 16*7
+
+;;%define DIGEST_SIZE (8*4*4) ; 8 streams x 4 32bit words per digest x 4 bytes per word
+
+;#define NUM_MD5_DIGEST_WORDS 4
+;#define NUM_LANES 8
+;#define MD5_BLOCK_SIZE 64
+;
+;typedef UINT32 digest_array[NUM_MD5_DIGEST_WORDS][NUM_LANES];
+;
+;typedef struct {
+; DECLARE_ALIGNED(digest_array digest, 16);
+; UINT8* data_ptr[NUM_LANES];
+;} MD5_ARGS_X8;
+
+; void md5_mb_x4x2_avx(MD5_ARGS_X8 *args, UINT64 size)
+; arg 1 : pointer to MD5_ARGS_X8 structure
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+;
+; arg1 and r8 are maintained by this function
+;
+align 32
+mk_global md5_mb_x4x2_avx, function, internal
+md5_mb_x4x2_avx:
+ endbranch
+ sub rsp, STACK_SIZE
+
+ ;; Initialize digests
+ vmovdqu A,[arg1+0*16]
+ vmovdqu B,[arg1+2*16]
+ vmovdqu C,[arg1+4*16]
+ vmovdqu D,[arg1+6*16]
+
+ vmovdqu A2,[arg1+1*16]
+ vmovdqu B2,[arg1+3*16]
+ vmovdqu C2,[arg1+5*16]
+ vmovdqu D2,[arg1+7*16]
+
+ lea TBL, [MD5_TABLE]
+
+ ;; load input pointers
+ mov inp0,[arg1 + _data_ptr + 0*8]
+ mov inp1,[arg1 + _data_ptr + 1*8]
+ mov inp2,[arg1 + _data_ptr + 2*8]
+ mov inp3,[arg1 + _data_ptr + 3*8]
+ mov inp4,[arg1 + _data_ptr + 4*8]
+ mov inp5,[arg1 + _data_ptr + 5*8]
+ mov inp6,[arg1 + _data_ptr + 6*8]
+ mov inp7,[arg1 + _data_ptr + 7*8]
+
+ xor IDX, IDX
+
+ ; Make ping-pong pointers to the two memory blocks
+ mov mem1, rsp
+ lea mem2, [rsp + 16*16*2]
+
+
+;; Load first block of data and save back to stack
+%assign I 0
+%rep 4
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem1+(I*4+0)*16],T0
+ vmovdqa [mem1+(I*4+1)*16],T1
+ vmovdqa [mem1+(I*4+2)*16],T2
+ vmovdqa [mem1+(I*4+3)*16],T3
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem1+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem1+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem1+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem1+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+%endrep
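+;; mem1 now holds the first block in transposed form: dword i of lanes 0-3 at
+;; mem1 + i*16 and of lanes 4-7 at mem1 + i*16 + 16*16, matching the stack
+;; layout shown above.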
+
+lloop:
+
+ ; save old digests
+ vmovdqa [AA], A
+ vmovdqa [BB], B
+ vmovdqa [CC], C
+ vmovdqa [DD], D
+ ; save old digests
+ vmovdqa [AA2], A2
+ vmovdqa [BB2], B2
+ vmovdqa [CC2], C2
+ vmovdqa [DD2], D2
+
+ add IDX, 4*16
+ sub arg2, 1
+ je lastblock
+
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 0*16, [TBL+ 0*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 1*16, [TBL+ 1*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 2*16, [TBL+ 2*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 3*16, [TBL+ 3*16], rot14
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 4*16, [TBL+ 4*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 5*16, [TBL+ 5*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 6*16, [TBL+ 6*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 7*16, [TBL+ 7*16], rot14
+
+%assign I 0
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 8*16, [TBL+ 8*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 9*16, [TBL+ 9*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+10*16, [TBL+10*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+11*16, [TBL+11*16], rot14
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+12*16, [TBL+12*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+13*16, [TBL+13*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+14*16, [TBL+14*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+15*16, [TBL+15*16], rot14
+
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 1*16, [TBL+16*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 6*16, [TBL+17*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+11*16, [TBL+18*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 0*16, [TBL+19*16], rot24
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 5*16, [TBL+20*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+10*16, [TBL+21*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+15*16, [TBL+22*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 4*16, [TBL+23*16], rot24
+
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 9*16, [TBL+24*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+14*16, [TBL+25*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 3*16, [TBL+26*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 8*16, [TBL+27*16], rot24
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+13*16, [TBL+28*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 2*16, [TBL+29*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 7*16, [TBL+30*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+12*16, [TBL+31*16], rot24
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 5*16, [TBL+32*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 8*16, [TBL+33*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+11*16, [TBL+34*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+14*16, [TBL+35*16], rot34
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 1*16, [TBL+36*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 4*16, [TBL+37*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 7*16, [TBL+38*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+10*16, [TBL+39*16], rot34
+
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+13*16, [TBL+40*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 0*16, [TBL+41*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 3*16, [TBL+42*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 6*16, [TBL+43*16], rot34
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 9*16, [TBL+44*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+12*16, [TBL+45*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+15*16, [TBL+46*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 2*16, [TBL+47*16], rot34
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 0*16, [TBL+48*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 7*16, [TBL+49*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+14*16, [TBL+50*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 5*16, [TBL+51*16], rot44
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+12*16, [TBL+52*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 3*16, [TBL+53*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+10*16, [TBL+54*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 1*16, [TBL+55*16], rot44
+
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 8*16, [TBL+56*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+15*16, [TBL+57*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 6*16, [TBL+58*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+13*16, [TBL+59*16], rot44
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 4*16, [TBL+60*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+11*16, [TBL+61*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 2*16, [TBL+62*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 9*16, [TBL+63*16], rot44
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+
+ vpaddd A,A,[AA]
+ vpaddd B,B,[BB]
+ vpaddd C,C,[CC]
+ vpaddd D,D,[DD]
+
+ vpaddd A2,A2,[AA2]
+ vpaddd B2,B2,[BB2]
+ vpaddd C2,C2,[CC2]
+ vpaddd D2,D2,[DD2]
+
+ ; swap mem1 and mem2
+ xchg mem1, mem2
+
+ jmp lloop
+
+lastblock:
+
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+ 0*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+ 1*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+ 2*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+ 3*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+ 4*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+ 5*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+ 6*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+ 7*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+ 8*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+ 9*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+10*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+11*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+12*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+13*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+14*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+15*16], rot14
+
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+16*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+17*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+18*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+19*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+20*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+21*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+22*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+23*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+24*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+25*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+26*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+27*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+28*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+29*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+30*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+31*16], rot24
+
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+32*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+33*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+34*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+35*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+36*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+37*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+38*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+39*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+40*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+41*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+42*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+43*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+44*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+45*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+46*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+47*16], rot34
+
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+48*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+49*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+50*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+51*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+52*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+53*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+54*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+55*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+56*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+57*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+58*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+59*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+60*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+61*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+62*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+63*16], rot44
+
+ vpaddd A,A,[AA]
+ vpaddd B,B,[BB]
+ vpaddd C,C,[CC]
+ vpaddd D,D,[DD]
+
+ vpaddd A2,A2,[AA2]
+ vpaddd B2,B2,[BB2]
+ vpaddd C2,C2,[CC2]
+ vpaddd D2,D2,[DD2]
+
+ ; write out digests
+ vmovdqu [arg1+0*16], A
+ vmovdqu [arg1+2*16], B
+ vmovdqu [arg1+4*16], C
+ vmovdqu [arg1+6*16], D
+
+ vmovdqu [arg1+1*16], A2
+ vmovdqu [arg1+3*16], B2
+ vmovdqu [arg1+5*16], C2
+ vmovdqu [arg1+7*16], D2
+
+ ;; update input pointers
+ add inp0, IDX
+ add inp1, IDX
+ add inp2, IDX
+ add inp3, IDX
+ add inp4, IDX
+ add inp5, IDX
+ add inp6, IDX
+ add inp7, IDX
+ mov [arg1 + _data_ptr + 0*8], inp0
+ mov [arg1 + _data_ptr + 1*8], inp1
+ mov [arg1 + _data_ptr + 2*8], inp2
+ mov [arg1 + _data_ptr + 3*8], inp3
+ mov [arg1 + _data_ptr + 4*8], inp4
+ mov [arg1 + _data_ptr + 5*8], inp5
+ mov [arg1 + _data_ptr + 6*8], inp6
+ mov [arg1 + _data_ptr + 7*8], inp7
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+ add rsp, STACK_SIZE
+
+ ret
+
+section .data align=64
+
+align 64
+MD5_TABLE:
+ dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
+ dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
+ dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
+ dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
+ dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
+ dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
+ dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
+ dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
+ dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
+ dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
+ dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
+ dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
+ dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
+ dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
+ dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
+ dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
+ dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
+ dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
+ dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
+ dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
+ dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
+ dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
+ dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
+ dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
+ dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
+ dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
+ dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
+ dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
+ dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
+ dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
+ dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
+ dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
+ dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
+ dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
+ dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
+ dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
+ dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
+ dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
+ dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
+ dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
+ dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
+ dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
+ dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
+ dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
+ dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
+ dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
+ dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
+ dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
+ dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
+ dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
+ dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
+ dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
+ dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
+ dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
+ dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
+ dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
+ dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
+ dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
+ dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
+ dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
+ dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
+ dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
+ dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
+ dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm
new file mode 100644
index 000000000..b3b946634
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm
@@ -0,0 +1,779 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+; clobbers all XMM registers
+; clobbers all GPRs except arg1 and r8
+
+;; code to compute octal MD5 using SSE
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ movdqa %%t0, %%r0
+ shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ movdqa %%t1, %%r2
+ shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ movdqa %%r1, %%t0
+ shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ movdqa %%r3, %%r0
+ shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
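+
+; Worked example (for illustration): if the four lanes hold
+;   r0 = {3 2 1 0}, r1 = {7 6 5 4}, r2 = {11 10 9 8}, r3 = {15 14 13 12}
+; then after TRANSPOSE
+;   t0 = {12 8 4 0}, r1 = {13 9 5 1}, r0 = {14 10 6 2}, r3 = {15 11 7 3}
+; i.e. each output register gathers the same dword index from all four lanes,
+; which is the layout the vectorized rounds below consume.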
+
+;;
+;; Magic functions defined in RFC 1321
+;;
+; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z))))
+%macro MAGIC_F 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ movdqa %%F,%%Z
+ pxor %%F,%%Y
+ pand %%F,%%X
+ pxor %%F,%%Z
+%endmacro
+
+; macro MAGIC_G F,X,Y,Z ;; F = F((Z),(X),(Y))
+%macro MAGIC_G 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ MAGIC_F %%F,%%Z,%%X,%%Y
+%endmacro
+
+; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z))
+%macro MAGIC_H 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ movdqa %%F,%%Z
+ pxor %%F,%%Y
+ pxor %%F,%%X
+%endmacro
+
+; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z)))
+%macro MAGIC_I 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ pcmpeqd %%F,%%F
+ pxor %%F,%%Z ; pnot %%Z
+ por %%F,%%X
+ pxor %%F,%%Y
+%endmacro
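+
+; For reference, the scalar forms from RFC 1321 (shown only as a sketch):
+;   F(x,y,z) = (x & y) | (~x & z)  ==  z ^ (x & (y ^ z))   (form used above)
+;   G(x,y,z) = (x & z) | (y & ~z)  ==  F(z,x,y)            (hence MAGIC_G calls MAGIC_F)
+;   H(x,y,z) = x ^ y ^ z
+;   I(x,y,z) = y ^ (x | ~z)
+; The rewritten F needs no NOT and only the single temporary %%F.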
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ psrld %%tmp, (32-%%imm)
+ pslld %%reg, %%imm
+ por %%reg, %%tmp
+%endmacro
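+
+; PROLD emulates a 32-bit rotate-left (SSE has no packed rotate instruction):
+;   rol32(x, n) == (x << n) | (x >> (32 - n))
+; which is exactly the psrld/pslld/por sequence above.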
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, data, MD5const, nrot
+%macro MD5_STEP1 14
+%define %%MAGIC_FUN %1
+%define %%A %2
+%define %%B %3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6
+%define %%B2 %7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%data %12
+%define %%MD5const %13
+%define %%nrot %14
+
+ paddd %%A, %%MD5const
+ paddd %%A2, %%MD5const
+ paddd %%A, [%%data]
+ paddd %%A2, [%%data + 16*16]
+ %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+ paddd %%A, %%FUN
+ %%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2
+ paddd %%A2, %%FUN
+ PROLD %%A,%%nrot, %%TMP
+ PROLD %%A2,%%nrot, %%TMP
+ paddd %%A, %%B
+ paddd %%A2, %%B2
+%endmacro
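+
+; Scalar sketch of one step, per lane:
+;   A = B + rol32(A + MAGIC(B,C,D) + data[k] + K, nrot)
+; MD5_STEP1 applies this to lanes 0-3 (A..D, data at [%%data]) and to lanes
+; 4-7 (A2..D2, data at [%%data + 16*16]), reusing a single FUN/TMP pair.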
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, FUN2, TMP2, data,
+; MD5const, nrot
+%macro MD5_STEP 16
+%define %%MAGIC_FUN %1
+%define %%A %2
+%define %%B %3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6
+%define %%B2 %7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%FUN2 %12
+%define %%TMP2 %13
+%define %%data %14
+%define %%MD5const %15
+%define %%nrot %16
+
+ paddd %%A, %%MD5const
+ paddd %%A2, %%MD5const
+ paddd %%A, [%%data]
+ paddd %%A2, [%%data + 16*16]
+ %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+ %%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2
+ paddd %%A, %%FUN
+ paddd %%A2, %%FUN2
+ PROLD %%A,%%nrot, %%TMP
+ PROLD %%A2,%%nrot, %%TMP2
+ paddd %%A, %%B
+ paddd %%A2, %%B2
+%endmacro
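+
+; MD5_STEP performs the same step as MD5_STEP1 but with an independent
+; FUN2/TMP2 pair for the second set of lanes, so the two halves do not contend
+; for the same temporaries. It is used in the final block, where no data loads
+; are interleaved and xmm10/xmm11 (otherwise T0/T1) are free to serve as
+; FUN2/TMP2.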
+
+;;
+;; MD5 left rotations (number of bits)
+;;
+rot11 equ 7
+rot12 equ 12
+rot13 equ 17
+rot14 equ 22
+rot21 equ 5
+rot22 equ 9
+rot23 equ 14
+rot24 equ 20
+rot31 equ 4
+rot32 equ 11
+rot33 equ 16
+rot34 equ 23
+rot41 equ 6
+rot42 equ 10
+rot43 equ 15
+rot44 equ 21
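+; (rotXY = rotation amount for round X, position Y in that round; these match
+;  the per-round shift constants 7,12,17,22 / 5,9,14,20 / 4,11,16,23 /
+;  6,10,15,21 of RFC 1321)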
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4 ; tmp
+%define F xmm5 ; tmp
+
+%define A2 xmm6
+%define B2 xmm7
+%define C2 xmm8
+%define D2 xmm9
+
+
+%define FUN E
+%define TMP F
+%define FUN2 xmm10
+%define TMP2 xmm11
+
+%define T0 xmm10
+%define T1 xmm11
+%define T2 xmm12
+%define T3 xmm13
+%define T4 xmm14
+%define T5 xmm15
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;; Linux Registers
+%define arg1 rdi
+%define arg2 rsi
+%define inp7 rcx
+%define mem1 rdx
+%else
+;; Windows Registers
+%define arg1 rcx
+%define arg2 rdx
+%define inp7 rdi
+%define mem1 rsi
+%endif
+; r8 is not used
+
+; Common definitions
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+
+%define TBL rax
+%define IDX rbx
+%define mem2 rbp
+
+
+; Stack Layout
+;
+; 470 DD2
+; 460 CC2
+; 450 BB2
+; 440 AA2
+; 430 DD
+; 420 CC
+; 410 BB
+; 400 AA
+;
+; 3F0 data2[15] for lanes 7...4 \
+; ... \
+; 300 data2[0] for lanes 7...4 \
+; 2F0 data2[15] for lanes 3...0 > mem block 2
+; ... /
+; 210 data2[1] for lanes 3...0 /
+; 200 data2[0] for lanes 3...0 /
+;
+; 1F0 data1[15] for lanes 7...4 \
+; ... \
+; 100 data1[0] for lanes 7...4 \
+; F0 data1[15] for lanes 3...0 > mem block 1
+; ... /
+; 10 data1[1] for lanes 3...0 /
+; 0 data1[0] for lanes 3...0 /
+
+MEM equ 16*16*2*2 ; two blocks of data stored in stack
+; STACK_SIZE must be an odd multiple of 8 bytes in size
+STACK_SIZE equ MEM + 16*8 + 8
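+; (on entry rsp is 8 mod 16 because of the return address pushed by call, so
+;  subtracting an odd multiple of 8 leaves rsp 16-byte aligned for the movdqa
+;  accesses to the areas defined below)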
+
+%define AA rsp + MEM + 16*0
+%define BB rsp + MEM + 16*1
+%define CC rsp + MEM + 16*2
+%define DD rsp + MEM + 16*3
+%define AA2 rsp + MEM + 16*4
+%define BB2 rsp + MEM + 16*5
+%define CC2 rsp + MEM + 16*6
+%define DD2 rsp + MEM + 16*7
+
+;;%define DIGEST_SIZE (8*4*4) ; 8 streams x 4 32bit words per digest x 4 bytes per word
+
+;#define NUM_MD5_DIGEST_WORDS 4
+;#define NUM_LANES 8
+;#define MD5_BLOCK_SIZE 64
+;
+;typedef UINT32 digest_array[NUM_MD5_DIGEST_WORDS][NUM_LANES];
+;
+;typedef struct {
+; DECLARE_ALIGNED(digest_array digest, 16);
+; UINT8* data_ptr[NUM_LANES];
+;} MD5_ARGS_X8;
+
+; void md5_mb_x4x2_sse(MD5_ARGS_X8 *args, UINT64 size)
+; arg 1 : pointer to MD5_ARGS_X8 structure
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+;
+; arg1 and r8 are maintained by this function
+;
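+; Illustrative call from C (a sketch; MD5_ARGS_X8 as in the commented block
+; above, n_blocks is just a placeholder name):
+;	MD5_ARGS_X8 args;			// transposed digests + 8 data pointers
+;	md5_mb_x4x2_sse(&args, n_blocks);	// hash n_blocks 64-byte blocks per lane
+;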
+align 32
+mk_global md5_mb_x4x2_sse, function, internal
+md5_mb_x4x2_sse:
+ endbranch
+ sub rsp, STACK_SIZE
+
+	;; Initialize digests for lanes 0-3
+ movdqu A,[arg1+0*16]
+ movdqu B,[arg1+2*16]
+ movdqu C,[arg1+4*16]
+ movdqu D,[arg1+6*16]
+
+	;; Initialize digests for lanes 4-7
+ movdqu A2,[arg1+1*16]
+ movdqu B2,[arg1+3*16]
+ movdqu C2,[arg1+5*16]
+ movdqu D2,[arg1+7*16]
+
+ lea TBL, [MD5_TABLE]
+
+ ;; load input pointers
+ mov inp0,[arg1 + _data_ptr + 0*8]
+ mov inp1,[arg1 + _data_ptr + 1*8]
+ mov inp2,[arg1 + _data_ptr + 2*8]
+ mov inp3,[arg1 + _data_ptr + 3*8]
+ mov inp4,[arg1 + _data_ptr + 4*8]
+ mov inp5,[arg1 + _data_ptr + 5*8]
+ mov inp6,[arg1 + _data_ptr + 6*8]
+ mov inp7,[arg1 + _data_ptr + 7*8]
+ xor IDX, IDX
+
+ ; Make ping-pong pointers to the two memory blocks
+ mov mem1, rsp
+ lea mem2, [rsp + 16*16*2]
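+	; mem1 holds the block currently being hashed; mem2 receives the next
+	; block as it is loaded and transposed. The xchg at the bottom of lloop
+	; swaps the two roles every iteration.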
+
+
+;; Load first block of data and save back to stack
+%assign I 0
+%rep 4
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem1+(I*4+0)*16],T0
+ movdqa [mem1+(I*4+1)*16],T1
+ movdqa [mem1+(I*4+2)*16],T2
+ movdqa [mem1+(I*4+3)*16],T3
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem1+(I*4+0)*16 + 16*16],T0
+ movdqa [mem1+(I*4+1)*16 + 16*16],T1
+ movdqa [mem1+(I*4+2)*16 + 16*16],T2
+ movdqa [mem1+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+%endrep
+
+lloop:
+ ; save old digests
+ movdqa [AA], A
+ movdqa [BB], B
+ movdqa [CC], C
+ movdqa [DD], D
+ ; save old digests
+ movdqa [AA2], A2
+ movdqa [BB2], B2
+ movdqa [CC2], C2
+ movdqa [DD2], D2
+
+ add IDX, 4*16
+ sub arg2, 1
+ je lastblock
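+
+	; Main-loop pattern: the 64 rounds below consume the transposed block in
+	; mem1 while, between groups of eight rounds, the next block is fetched
+	; from the input streams and transposed into mem2, which helps hide the
+	; load latency behind the arithmetic.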
+
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+ 0*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 1*16, [TBL+ 1*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+ 2*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 3*16, [TBL+ 3*16], rot14
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+ 4*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 5*16, [TBL+ 5*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+ 6*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 7*16, [TBL+ 7*16], rot14
+
+%assign I 0
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16],T0
+ movdqa [mem2+(I*4+1)*16],T1
+ movdqa [mem2+(I*4+2)*16],T2
+ movdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+ 8*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 9*16, [TBL+ 9*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+10*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +11*16, [TBL+11*16], rot14
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+12*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +13*16, [TBL+13*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+14*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +15*16, [TBL+15*16], rot14
+
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16 + 16*16],T0
+ movdqa [mem2+(I*4+1)*16 + 16*16],T1
+ movdqa [mem2+(I*4+2)*16 + 16*16],T2
+ movdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+16*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 6*16, [TBL+17*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+18*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 0*16, [TBL+19*16], rot24
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+20*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +10*16, [TBL+21*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+22*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 4*16, [TBL+23*16], rot24
+
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16],T0
+ movdqa [mem2+(I*4+1)*16],T1
+ movdqa [mem2+(I*4+2)*16],T2
+ movdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+24*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +14*16, [TBL+25*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+26*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 8*16, [TBL+27*16], rot24
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+28*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 2*16, [TBL+29*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+30*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +12*16, [TBL+31*16], rot24
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16 + 16*16],T0
+ movdqa [mem2+(I*4+1)*16 + 16*16],T1
+ movdqa [mem2+(I*4+2)*16 + 16*16],T2
+ movdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+32*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 8*16, [TBL+33*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+34*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +14*16, [TBL+35*16], rot34
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+36*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 4*16, [TBL+37*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+38*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +10*16, [TBL+39*16], rot34
+
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16],T0
+ movdqa [mem2+(I*4+1)*16],T1
+ movdqa [mem2+(I*4+2)*16],T2
+ movdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+40*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 0*16, [TBL+41*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+42*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 6*16, [TBL+43*16], rot34
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+44*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +12*16, [TBL+45*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+46*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 2*16, [TBL+47*16], rot34
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16 + 16*16],T0
+ movdqa [mem2+(I*4+1)*16 + 16*16],T1
+ movdqa [mem2+(I*4+2)*16 + 16*16],T2
+ movdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+48*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 7*16, [TBL+49*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+50*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 5*16, [TBL+51*16], rot44
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+52*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 3*16, [TBL+53*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+54*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 1*16, [TBL+55*16], rot44
+
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16],T0
+ movdqa [mem2+(I*4+1)*16],T1
+ movdqa [mem2+(I*4+2)*16],T2
+ movdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+56*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +15*16, [TBL+57*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+58*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +13*16, [TBL+59*16], rot44
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+60*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +11*16, [TBL+61*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+62*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 9*16, [TBL+63*16], rot44
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16 + 16*16],T0
+ movdqa [mem2+(I*4+1)*16 + 16*16],T1
+ movdqa [mem2+(I*4+2)*16 + 16*16],T2
+ movdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+
+ paddd A,[AA]
+ paddd B,[BB]
+ paddd C,[CC]
+ paddd D,[DD]
+
+ paddd A2,[AA2]
+ paddd B2,[BB2]
+ paddd C2,[CC2]
+ paddd D2,[DD2]
+
+ ; swap mem1 and mem2
+ xchg mem1, mem2
+
+ jmp lloop
+
+lastblock:
+
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+ 0*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+ 1*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+ 2*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+ 3*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+ 4*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+ 5*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+ 6*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+ 7*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+ 8*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+ 9*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+10*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+11*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+12*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+13*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+14*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+15*16], rot14
+
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+16*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+17*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+18*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+19*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+20*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+21*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+22*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+23*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+24*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+25*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+26*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+27*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+28*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+29*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+30*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+31*16], rot24
+
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+32*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+33*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+34*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+35*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+36*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+37*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+38*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+39*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+40*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+41*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+42*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+43*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+44*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+45*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+46*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+47*16], rot34
+
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+48*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+49*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+50*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+51*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+52*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+53*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+54*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+55*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+56*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+57*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+58*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+59*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+60*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+61*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+62*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+63*16], rot44
+
+ paddd A,[AA]
+ paddd B,[BB]
+ paddd C,[CC]
+ paddd D,[DD]
+
+ paddd A2,[AA2]
+ paddd B2,[BB2]
+ paddd C2,[CC2]
+ paddd D2,[DD2]
+
+ ; write out digests
+ movdqu [arg1+0*16], A
+ movdqu [arg1+2*16], B
+ movdqu [arg1+4*16], C
+ movdqu [arg1+6*16], D
+ movdqu [arg1+1*16], A2
+ movdqu [arg1+3*16], B2
+ movdqu [arg1+5*16], C2
+ movdqu [arg1+7*16], D2
+
+ ;; update input pointers
+ add inp0, IDX
+ add inp1, IDX
+ add inp2, IDX
+ add inp3, IDX
+ add inp4, IDX
+ add inp5, IDX
+ add inp6, IDX
+ add inp7, IDX
+ mov [arg1 + _data_ptr + 0*8], inp0
+ mov [arg1 + _data_ptr + 1*8], inp1
+ mov [arg1 + _data_ptr + 2*8], inp2
+ mov [arg1 + _data_ptr + 3*8], inp3
+ mov [arg1 + _data_ptr + 4*8], inp4
+ mov [arg1 + _data_ptr + 5*8], inp5
+ mov [arg1 + _data_ptr + 6*8], inp6
+ mov [arg1 + _data_ptr + 7*8], inp7
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+ add rsp, STACK_SIZE
+
+ ret
+
+section .data align=64
+
+align 64
+MD5_TABLE:
+ dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
+ dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
+ dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
+ dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
+ dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
+ dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
+ dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
+ dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
+ dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
+ dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
+ dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
+ dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
+ dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
+ dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
+ dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
+ dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
+ dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
+ dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
+ dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
+ dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
+ dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
+ dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
+ dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
+ dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
+ dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
+ dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
+ dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
+ dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
+ dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
+ dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
+ dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
+ dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
+ dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
+ dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
+ dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
+ dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
+ dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
+ dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
+ dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
+ dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
+ dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
+ dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
+ dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
+ dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
+ dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
+ dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
+ dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
+ dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
+ dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
+ dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
+ dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
+ dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
+ dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
+ dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
+ dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
+ dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
+ dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
+ dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
+ dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
+ dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
+ dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
+ dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
+ dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
+ dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm
new file mode 100644
index 000000000..b5d6a4875
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm
@@ -0,0 +1,920 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute double octal (16-lane) MD5 using AVX2
+
+;; Stack must be aligned to 32 bytes before call
+;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp
+;;
+;; clobbers ymm0-15
+
+;; clobbers all GPRs other than arg1 and rbp
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx
+ %define arg2 rdx
+ %define reg3 rdi
+ %define reg4 rsi
+%else
+ %define arg1 rdi
+ %define arg2 rsi
+ %define reg3 rcx
+ %define reg4 rdx
+%endif
+
+;; rbp is not clobbered
+
+%define state arg1
+%define num_blks arg2
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+%define inp4 r12
+%define inp5 r13
+%define inp6 r14
+%define inp7 r15
+
+;; These are pointers to data block1 and block2 in the stack
+; which will ping pong back and forth
+%define DPTR1 rbx
+%define DPTR2 reg3
+
+%define TBL rax
+%define IDX reg4
+
+;; Transposed Digest Storage
+%define Y_A ymm0
+%define Y_B ymm1
+%define Y_C ymm2
+%define Y_D ymm3
+%define Y_A2 ymm4
+%define Y_B2 ymm5
+%define Y_C2 ymm6
+%define Y_D2 ymm7
+
+;; Temp YMM registers corresponding to the Temp XMM registers
+;; used during the transposition of the digests
+%define Y_KTMP1 ymm12
+%define Y_KTMP2 ymm13
+;; Temporary registers used during MD5 round operations
+%define Y_FUN ymm8
+%define Y_TMP ymm9
+%define Y_FUN2 ymm10
+%define Y_TMP2 ymm11
+
+
+;; YMM registers used during data fetching.
+;; Data are stored into the stack after transposition
+%define Y_DAT0 ymm8
+%define Y_DAT1 ymm9
+%define Y_DAT2 ymm10
+%define Y_DAT3 ymm11
+%define Y_DAT4 ymm12
+%define Y_DAT5 ymm13
+%define Y_DAT6 ymm14
+%define Y_DAT7 ymm15
+
+;; Temporary registers used during data transposition
+%define Y_DTMP1 ymm0
+%define Y_DTMP2 ymm1
+
+
+%define RESY resb 32*
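+; (defined so that "RESY n" expands to "resb 32*n", i.e. n YMM-sized slots)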
+;; Assume stack aligned to 32 bytes before call
+;; Therefore FRAMESIZE mod 32 must be 32-8 = 24
+struc STACK
+_DATA: RESY 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs
+_DIGEST: RESY 8 ; stores Y_AA-Y_DD, Y_AA2-Y_DD2
+_TMPDIGEST: RESY 2 ; stores Y_AA, Y_BB temporarily
+_RSP_SAVE: RESQ 1 ; original RSP
+endstruc
+
+
+%define Y_AA rsp + _DIGEST + 32*0
+%define Y_BB rsp + _DIGEST + 32*1
+%define Y_CC rsp + _DIGEST + 32*2
+%define Y_DD rsp + _DIGEST + 32*3
+%define Y_AA2 rsp + _DIGEST + 32*4
+%define Y_BB2 rsp + _DIGEST + 32*5
+%define Y_CC2 rsp + _DIGEST + 32*6
+%define Y_DD2 rsp + _DIGEST + 32*7
+
+%define MD5_DIGEST_ROW_SIZE (16*4)
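+; (one digest row holds one 32-bit word for each of the 16 lanes: 16 * 4 bytes)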
+
+;;
+;; MD5 left rotations (number of bits)
+;;
+rot11 equ 7
+rot12 equ 12
+rot13 equ 17
+rot14 equ 22
+rot21 equ 5
+rot22 equ 9
+rot23 equ 14
+rot24 equ 20
+rot31 equ 4
+rot32 equ 11
+rot33 equ 16
+rot34 equ 23
+rot41 equ 6
+rot42 equ 10
+rot43 equ 15
+rot44 equ 21
+
+; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+; "transpose" data in {r0...r7} using temps {t0...t1}
+; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
+;
+; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+
+;
+%macro TRANSPOSE8 10
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
+
+
+ ; use r2 in place of t0
+ ; process bottom half (r4..r7) {e...h}
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
+
+
+ vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6
+ vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2
+ vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5
+ vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1
+ vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7
+ vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3
+ vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4
+ vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0
+%endmacro
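+
+; The transpose works in two stages: 4x4 dword shuffles within each 128-bit
+; lane (vshufps), then vperm2f128 to exchange the 128-bit halves, so that each
+; output YMM register ends up holding the same dword index from all 8 streams.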
+
+
+;;
+;; Magic functions defined in RFC 1321
+;;
+; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z))))
+%macro MAGIC_F 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ vpxor %%F,%%Z, %%Y
+ vpand %%F,%%F,%%X
+ vpxor %%F,%%F,%%Z
+%endmacro
+
+; macro MAGIC_G F,X,Y,Z ;; F = F((Z),(X),(Y))
+%macro MAGIC_G 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ MAGIC_F %%F,%%Z,%%X,%%Y
+%endmacro
+
+; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z))
+%macro MAGIC_H 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ vpxor %%F,%%Z, %%Y
+ vpxor %%F,%%F, %%X
+%endmacro
+
+; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z)))
+%macro MAGIC_I 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+	vpcmpeqd %%F,%%F,%%F	; F = all ones
+ vpxor %%F,%%F,%%Z ; pnot %%Z
+ vpor %%F,%%F,%%X
+ vpxor %%F,%%F,%%Y
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-%%imm)
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, FUN2, TMP2, data,
+; MD5const, nrot
+%macro MD5_STEP 16
+%define %%MAGIC_FUN %1
+%define %%rA %2
+%define %%rB %3
+%define %%rC %4
+%define %%rD %5
+%define %%rA2 %6
+%define %%rB2 %7
+%define %%rC2 %8
+%define %%rD2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%FUN2 %12
+%define %%TMP2 %13
+%define %%data %14
+%define %%MD5const %15
+%define %%nrot %16
+
+ vpaddd %%rA, %%rA, %%MD5const
+ vpaddd %%rA2, %%rA2, %%MD5const
+ vpaddd %%rA, %%rA, [%%data]
+ vpaddd %%rA2, %%rA2, [%%data + 16*32]
+ %%MAGIC_FUN %%FUN, %%rB,%%rC,%%rD
+ %%MAGIC_FUN %%FUN2, %%rB2,%%rC2,%%rD2
+ vpaddd %%rA, %%rA, %%FUN
+ vpaddd %%rA2, %%rA2, %%FUN2
+ PROLD %%rA,%%nrot, %%TMP
+ PROLD %%rA2,%%nrot, %%TMP2
+ vpaddd %%rA, %%rA, %%rB
+ vpaddd %%rA2, %%rA2, %%rB2
+%endmacro
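+
+; ([%%data] is the current message word for lanes 0-7; [%%data + 16*32] is the
+;  same word for lanes 8-15, since the stack keeps 16 YMM rows per 8-lane set)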
+
+align 32
+
+; void md5_mb_x8x2_avx2(MD5_ARGS *args, UINT64 num_blks)
+; arg 1 : pointer to MD5_ARGS structure
+; arg 2 : number of blocks (>=1)
+
+mk_global md5_mb_x8x2_avx2, function, internal
+md5_mb_x8x2_avx2:
+ endbranch
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -32
+ mov [rsp + _RSP_SAVE], rax
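+	; rsp is aligned down to 32 bytes for the vmovdqa stores below; the
+	; original rsp (saved in rax) is kept at _RSP_SAVE so the Postamble can
+	; restore it with a single mov.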
+
+ mov DPTR1, rsp
+ lea DPTR2, [rsp + 32*32]
+
+ ;; Load MD5 constant pointer to register
+ lea TBL, [MD5_TABLE]
+
+ ; Initialize index for data retrieval
+ xor IDX, IDX
+
+ ;; Fetch Pointers to Data Stream 1 to 8
+ mov inp0,[state + _data_ptr + 0*8]
+ mov inp1,[state + _data_ptr + 1*8]
+ mov inp2,[state + _data_ptr + 2*8]
+ mov inp3,[state + _data_ptr + 3*8]
+ mov inp4,[state + _data_ptr + 4*8]
+ mov inp5,[state + _data_ptr + 5*8]
+ mov inp6,[state + _data_ptr + 6*8]
+ mov inp7,[state + _data_ptr + 7*8]
+
+%assign I 0
+%rep 2
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR1+_DATA+(I*8+0)*32],Y_DAT0
+ vmovdqa [DPTR1+_DATA+(I*8+1)*32],Y_DAT1
+ vmovdqa [DPTR1+_DATA+(I*8+2)*32],Y_DAT2
+ vmovdqa [DPTR1+_DATA+(I*8+3)*32],Y_DAT3
+ vmovdqa [DPTR1+_DATA+(I*8+4)*32],Y_DAT4
+ vmovdqa [DPTR1+_DATA+(I*8+5)*32],Y_DAT5
+ vmovdqa [DPTR1+_DATA+(I*8+6)*32],Y_DAT6
+ vmovdqa [DPTR1+_DATA+(I*8+7)*32],Y_DAT7
+
+%assign I (I+1)
+%endrep
+
+ ;; Fetch Pointers to Data Stream 9 to 16
+ mov inp0,[state + _data_ptr + 8*8]
+ mov inp1,[state + _data_ptr + 9*8]
+ mov inp2,[state + _data_ptr + 10*8]
+ mov inp3,[state + _data_ptr + 11*8]
+ mov inp4,[state + _data_ptr + 12*8]
+ mov inp5,[state + _data_ptr + 13*8]
+ mov inp6,[state + _data_ptr + 14*8]
+ mov inp7,[state + _data_ptr + 15*8]
+
+%assign I 0
+%rep 2
+
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR1+_DATA+((I+2)*8+0)*32],Y_DAT0
+ vmovdqa [DPTR1+_DATA+((I+2)*8+1)*32],Y_DAT1
+ vmovdqa [DPTR1+_DATA+((I+2)*8+2)*32],Y_DAT2
+ vmovdqa [DPTR1+_DATA+((I+2)*8+3)*32],Y_DAT3
+ vmovdqa [DPTR1+_DATA+((I+2)*8+4)*32],Y_DAT4
+ vmovdqa [DPTR1+_DATA+((I+2)*8+5)*32],Y_DAT5
+ vmovdqa [DPTR1+_DATA+((I+2)*8+6)*32],Y_DAT6
+ vmovdqa [DPTR1+_DATA+((I+2)*8+7)*32],Y_DAT7
+
+%assign I (I+1)
+%endrep
+ ;; digests are already transposed
+ vmovdqu Y_A,[state + 0 * MD5_DIGEST_ROW_SIZE ]
+ vmovdqu Y_B,[state + 1 * MD5_DIGEST_ROW_SIZE ]
+ vmovdqu Y_C,[state + 2 * MD5_DIGEST_ROW_SIZE ]
+ vmovdqu Y_D,[state + 3 * MD5_DIGEST_ROW_SIZE ]
+
+ ; Load the digest for each stream (9-16)
+ vmovdqu Y_A2,[state + 0 * MD5_DIGEST_ROW_SIZE + 32]
+ vmovdqu Y_B2,[state + 1 * MD5_DIGEST_ROW_SIZE + 32]
+ vmovdqu Y_C2,[state + 2 * MD5_DIGEST_ROW_SIZE + 32]
+ vmovdqu Y_D2,[state + 3 * MD5_DIGEST_ROW_SIZE + 32]
+
+lloop:
+
+ ; save old digests to stack
+ vmovdqa [Y_AA], Y_A
+ vmovdqa [Y_BB], Y_B
+ vmovdqa [Y_CC], Y_C
+ vmovdqa [Y_DD], Y_D
+
+ vmovdqa [Y_AA2], Y_A2
+ vmovdqa [Y_BB2], Y_B2
+ vmovdqa [Y_CC2], Y_C2
+ vmovdqa [Y_DD2], Y_D2
+
+ ;; Increment IDX to point to next data block (64 bytes per block)
+ add IDX, 64
+
+ ;; Update size of remaining blocks to process
+ sub num_blks, 1
+ je lastblock
+
+ ; Perform the 64 rounds of processing ...
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+ 0*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+ 1*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+ 2*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+ 3*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+ 4*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+ 5*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+ 6*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+ 7*32], rot14
+
+
+	;; Fetch Pointers to Data Stream 1 to 8 again (inp0..inp7 were reused for streams 9 to 16)
+ mov inp0,[state + _data_ptr + 0*8]
+ mov inp1,[state + _data_ptr + 1*8]
+ mov inp2,[state + _data_ptr + 2*8]
+ mov inp3,[state + _data_ptr + 3*8]
+ mov inp4,[state + _data_ptr + 4*8]
+ mov inp5,[state + _data_ptr + 5*8]
+ mov inp6,[state + _data_ptr + 6*8]
+ mov inp7,[state + _data_ptr + 7*8]
+
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+ 8*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+ 9*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+10*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+11*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+12*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+13*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+14*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+15*32], rot14
+
+%assign I 0
+
+ ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
+ ; Therefore we need to save these to stack and restore after transpose
+ vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
+ vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
+
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR2+_DATA+(I*8+0)*32],Y_DAT0
+ vmovdqa [DPTR2+_DATA+(I*8+1)*32],Y_DAT1
+ vmovdqa [DPTR2+_DATA+(I*8+2)*32],Y_DAT2
+ vmovdqa [DPTR2+_DATA+(I*8+3)*32],Y_DAT3
+ vmovdqa [DPTR2+_DATA+(I*8+4)*32],Y_DAT4
+ vmovdqa [DPTR2+_DATA+(I*8+5)*32],Y_DAT5
+ vmovdqa [DPTR2+_DATA+(I*8+6)*32],Y_DAT6
+ vmovdqa [DPTR2+_DATA+(I*8+7)*32],Y_DAT7
+
+ ; Restore Y_A and Y_B
+ vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
+ vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
+
+
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+16*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+17*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+18*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+19*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+20*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+21*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+22*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+23*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+24*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+25*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+26*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+27*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+28*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+29*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+30*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+31*32], rot24
+
+%assign I (I+1)
+
+ ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
+ ; Therefore we need to save these to stack and restore after transpose
+ vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
+ vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
+
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR2+_DATA+(I*8+0)*32],Y_DAT0
+ vmovdqa [DPTR2+_DATA+(I*8+1)*32],Y_DAT1
+ vmovdqa [DPTR2+_DATA+(I*8+2)*32],Y_DAT2
+ vmovdqa [DPTR2+_DATA+(I*8+3)*32],Y_DAT3
+ vmovdqa [DPTR2+_DATA+(I*8+4)*32],Y_DAT4
+ vmovdqa [DPTR2+_DATA+(I*8+5)*32],Y_DAT5
+ vmovdqa [DPTR2+_DATA+(I*8+6)*32],Y_DAT6
+ vmovdqa [DPTR2+_DATA+(I*8+7)*32],Y_DAT7
+
+ ; Restore Y_A and Y_B
+ vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
+ vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
+
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+32*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+33*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+34*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+35*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+36*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+37*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+38*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+39*32], rot34
+
+ ;; Fetch Pointers to Data Stream 9 to 16
+ mov inp0,[state + _data_ptr + 8*8]
+ mov inp1,[state + _data_ptr + 9*8]
+ mov inp2,[state + _data_ptr + 10*8]
+ mov inp3,[state + _data_ptr + 11*8]
+ mov inp4,[state + _data_ptr + 12*8]
+ mov inp5,[state + _data_ptr + 13*8]
+ mov inp6,[state + _data_ptr + 14*8]
+ mov inp7,[state + _data_ptr + 15*8]
+
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+40*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+41*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+42*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+43*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+44*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+45*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+46*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+47*32], rot34
+
+%assign I 0
+
+ ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
+ ; Therefore we need to save these to stack and restore after transpose
+ vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
+ vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
+
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR2+_DATA+((I+2)*8+0)*32],Y_DAT0
+ vmovdqa [DPTR2+_DATA+((I+2)*8+1)*32],Y_DAT1
+ vmovdqa [DPTR2+_DATA+((I+2)*8+2)*32],Y_DAT2
+ vmovdqa [DPTR2+_DATA+((I+2)*8+3)*32],Y_DAT3
+ vmovdqa [DPTR2+_DATA+((I+2)*8+4)*32],Y_DAT4
+ vmovdqa [DPTR2+_DATA+((I+2)*8+5)*32],Y_DAT5
+ vmovdqa [DPTR2+_DATA+((I+2)*8+6)*32],Y_DAT6
+ vmovdqa [DPTR2+_DATA+((I+2)*8+7)*32],Y_DAT7
+
+ ; Restore Y_A and Y_B
+ vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
+ vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
+
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+48*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+49*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+50*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+51*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+52*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+53*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+54*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+55*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+56*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+57*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+58*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+59*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+60*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+61*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+62*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+63*32], rot44
+
+%assign I (I+1)
+
+ ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
+ ; Therefore we need to save these to stack and restore after transpose
+ vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
+ vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
+
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR2+_DATA+((I+2)*8+0)*32],Y_DAT0
+ vmovdqa [DPTR2+_DATA+((I+2)*8+1)*32],Y_DAT1
+ vmovdqa [DPTR2+_DATA+((I+2)*8+2)*32],Y_DAT2
+ vmovdqa [DPTR2+_DATA+((I+2)*8+3)*32],Y_DAT3
+ vmovdqa [DPTR2+_DATA+((I+2)*8+4)*32],Y_DAT4
+ vmovdqa [DPTR2+_DATA+((I+2)*8+5)*32],Y_DAT5
+ vmovdqa [DPTR2+_DATA+((I+2)*8+6)*32],Y_DAT6
+ vmovdqa [DPTR2+_DATA+((I+2)*8+7)*32],Y_DAT7
+
+ ; Restore Y_A and Y_B
+ vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
+ vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
+
+ ; Add results to old digest values
+
+ vpaddd Y_A,Y_A,[Y_AA]
+ vpaddd Y_B,Y_B,[Y_BB]
+ vpaddd Y_C,Y_C,[Y_CC]
+ vpaddd Y_D,Y_D,[Y_DD]
+
+ vpaddd Y_A2,Y_A2,[Y_AA2]
+ vpaddd Y_B2,Y_B2,[Y_BB2]
+ vpaddd Y_C2,Y_C2,[Y_CC2]
+ vpaddd Y_D2,Y_D2,[Y_DD2]
+
+ ; Swap DPTR1 and DPTR2
+ xchg DPTR1, DPTR2
+
+ ;; Proceed to processing of next block
+ jmp lloop
+
+lastblock:
+
+ ; Perform the 64 rounds of processing ...
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+ 0*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+ 1*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+ 2*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+ 3*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+ 4*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+ 5*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+ 6*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+ 7*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+ 8*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+ 9*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+10*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+11*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+12*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+13*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+14*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+15*32], rot14
+
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+16*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+17*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+18*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+19*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+20*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+21*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+22*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+23*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+24*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+25*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+26*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+27*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+28*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+29*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+30*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+31*32], rot24
+
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+32*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+33*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+34*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+35*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+36*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+37*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+38*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+39*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+40*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+41*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+42*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+43*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+44*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+45*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+46*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+47*32], rot34
+
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+48*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+49*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+50*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+51*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+52*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+53*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+54*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+55*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+56*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+57*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+58*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+59*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+60*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+61*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+62*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+63*32], rot44
+
+	;; update the input data pointers
+%assign I 0
+%rep 8
+ mov inp0, [state + _data_ptr + (2*I)*8]
+ mov inp1, [state + _data_ptr + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [state + _data_ptr + (2*I)*8], inp0
+ mov [state + _data_ptr + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+
+ vpaddd Y_A,Y_A,[Y_AA]
+ vpaddd Y_B,Y_B,[Y_BB]
+ vpaddd Y_C,Y_C,[Y_CC]
+ vpaddd Y_D,Y_D,[Y_DD]
+
+ vpaddd Y_A2,Y_A2,[Y_AA2]
+ vpaddd Y_B2,Y_B2,[Y_BB2]
+ vpaddd Y_C2,Y_C2,[Y_CC2]
+ vpaddd Y_D2,Y_D2,[Y_DD2]
+
+
+
+ vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE ],Y_A
+ vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE ],Y_B
+ vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE ],Y_C
+ vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE ],Y_D
+
+
+ vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE + 32 ],Y_A2 ;; 32 is YMM width
+ vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE + 32 ],Y_B2
+ vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE + 32 ],Y_C2
+ vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE + 32 ],Y_D2
+
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+
+
+ mov rsp, [rsp + _RSP_SAVE]
+
+ ret
+
+section .data
+align 64
+MD5_TABLE:
+ dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
+ dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
+ dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
+ dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
+ dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
+ dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
+ dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
+ dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
+ dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
+ dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
+ dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
+ dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
+ dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
+ dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
+ dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
+ dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
+ dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
+ dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
+ dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
+ dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
+ dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
+ dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
+ dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
+ dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
+ dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
+ dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
+ dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
+ dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
+ dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
+ dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
+ dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
+ dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
+ dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
+ dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
+ dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
+ dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
+ dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
+ dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
+ dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
+ dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
+ dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
+ dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
+ dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
+ dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
+ dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
+ dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
+ dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
+ dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
+ dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
+ dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
+ dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
+ dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
+ dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
+ dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
+ dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
+ dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
+ dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
+ dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
+ dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
+ dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
+ dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
+ dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
+ dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
+ dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
+ dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
+ dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
+ dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
+ dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
+ dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
+ dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
+ dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
+ dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
+ dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
+ dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
+ dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
+ dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
+ dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
+ dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
+ dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
+ dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
+ dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
+ dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
+ dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
+ dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
+ dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
+ dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
+ dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
+ dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
+ dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
+ dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
+ dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
+ dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
+ dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
+ dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
+ dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
+ dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
+ dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
+ dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
+ dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
+ dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
+ dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
+ dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
+ dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
+ dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
+ dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
+ dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
+ dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
+ dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
+ dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
+ dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
+ dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
+ dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
+ dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
+ dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
+ dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
+ dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
+ dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
+ dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
+ dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
+ dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
+ dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
+ dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
+ dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
+ dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
+ dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
+ dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
+ dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
+ dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
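
Each round constant in MD5_TABLE above is replicated across eight dwords, so a single 32-byte load broadcasts it to every lane of a YMM register; the kernel keeps two such register sets (Y_A..Y_D and Y_A2..Y_D2) to drive 16 buffers per pass. The following is only an illustrative scalar model of what one vectorized round-1 step does per lane; the identifiers are made up and not taken from the assembly.

    /* Scalar model of an 8-lane MD5 round-1 step: the same F-function,
     * constant and rotation applied to NLANES independent streams.
     * 0xd76aa478 is the first constant in MD5_TABLE; 7 is MD5's
     * first-round rotation (rot11 in the macros above). */
    #include <stdint.h>

    #define NLANES 8                               /* dwords per YMM register */
    #define ROL32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))

    void md5_round1_step_lanes(uint32_t a[NLANES], const uint32_t b[NLANES],
                               const uint32_t c[NLANES], const uint32_t d[NLANES],
                               const uint32_t w[NLANES])
    {
            for (int lane = 0; lane < NLANES; lane++) {
                    uint32_t f = d[lane] ^ (b[lane] & (c[lane] ^ d[lane])); /* MAGIC_F */
                    uint32_t t = a[lane] + f + 0xd76aa478 + w[lane];
                    a[lane] = b[lane] + ROL32(t, 7);
            }
    }
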
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm
new file mode 100644
index 000000000..6e31d297a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm
@@ -0,0 +1,80 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+default rel
+[bits 64]
+
+; declare the L3 ctx level symbols (these will then call the appropriate
+; L2 symbols)
+extern md5_ctx_mgr_init_sse
+extern md5_ctx_mgr_submit_sse
+extern md5_ctx_mgr_flush_sse
+
+extern md5_ctx_mgr_init_avx
+extern md5_ctx_mgr_submit_avx
+extern md5_ctx_mgr_flush_avx
+
+extern md5_ctx_mgr_init_avx2
+extern md5_ctx_mgr_submit_avx2
+extern md5_ctx_mgr_flush_avx2
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ extern md5_ctx_mgr_init_avx512
+ extern md5_ctx_mgr_submit_avx512
+ extern md5_ctx_mgr_flush_avx512
+%endif
+
+extern md5_ctx_mgr_init_base
+extern md5_ctx_mgr_submit_base
+extern md5_ctx_mgr_flush_base
+
+;;; *_mbinit are the initial values for *_dispatched, which is updated on the first call.
+;;; Therefore, *_dispatch_init is only executed on the first call.
+
+; Initialise symbols
+mbin_interface md5_ctx_mgr_init
+mbin_interface md5_ctx_mgr_submit
+mbin_interface md5_ctx_mgr_flush
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ mbin_dispatch_init6 md5_ctx_mgr_init, md5_ctx_mgr_init_base, md5_ctx_mgr_init_sse, md5_ctx_mgr_init_avx, md5_ctx_mgr_init_avx2, md5_ctx_mgr_init_avx512
+ mbin_dispatch_init6 md5_ctx_mgr_submit, md5_ctx_mgr_submit_base, md5_ctx_mgr_submit_sse, md5_ctx_mgr_submit_avx, md5_ctx_mgr_submit_avx2, md5_ctx_mgr_submit_avx512
+ mbin_dispatch_init6 md5_ctx_mgr_flush, md5_ctx_mgr_flush_base, md5_ctx_mgr_flush_sse, md5_ctx_mgr_flush_avx, md5_ctx_mgr_flush_avx2, md5_ctx_mgr_flush_avx512
+%else
+ mbin_dispatch_init md5_ctx_mgr_init, md5_ctx_mgr_init_sse, md5_ctx_mgr_init_avx, md5_ctx_mgr_init_avx2
+ mbin_dispatch_init md5_ctx_mgr_submit, md5_ctx_mgr_submit_sse, md5_ctx_mgr_submit_avx, md5_ctx_mgr_submit_avx2
+ mbin_dispatch_init md5_ctx_mgr_flush, md5_ctx_mgr_flush_sse, md5_ctx_mgr_flush_avx, md5_ctx_mgr_flush_avx2
+%endif
+
+;; func core, ver, snum
+slversion md5_ctx_mgr_init, 00, 04, 0189
+slversion md5_ctx_mgr_submit, 00, 04, 018a
+slversion md5_ctx_mgr_flush, 00, 04, 018b
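
The mbin_interface / mbin_dispatch_init macros above implement lazy CPU-feature dispatch: each public md5_ctx_mgr_* symbol initially routes through a resolver, and the first call records the best implementation the CPU (and assembler) supports, so subsequent calls jump straight to it. Below is only a rough C analogy of that pattern, not the actual assembly, and all names are made up.

    /* C sketch of the multibinary lazy-dispatch idea; the real mechanism
     * lives in multibinary.asm. */
    #include <stdio.h>

    typedef void (*init_fn)(void);

    static void init_base(void) { puts("base init"); }
    static void init_avx2(void) { puts("avx2 init"); }

    static int cpu_has_avx2(void) { return 0; }   /* stand-in for the real CPUID probe */

    static void init_resolver(void);

    /* the "dispatched" slot starts at the resolver and is rewritten on first use */
    static init_fn init_dispatched = init_resolver;

    static void init_resolver(void)
    {
            init_dispatched = cpu_has_avx2() ? init_avx2 : init_base;
            init_dispatched();                    /* forward the very first call */
    }

    /* public entry point: after the first call this is a single indirect jump */
    void md5_ctx_mgr_init_sketch(void)
    {
            init_dispatched();
    }

    int main(void)
    {
            md5_ctx_mgr_init_sketch();            /* resolves, then runs the pick */
            md5_ctx_mgr_init_sketch();            /* goes straight to the pick */
            return 0;
    }
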
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c
new file mode 100644
index 000000000..ed4721107
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c
@@ -0,0 +1,186 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "endian_helper.h"
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference MD5 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+static void OPT_FIX md5_single(const uint8_t * data, uint32_t digest[4]);
+
+#define H0 0x67452301
+#define H1 0xefcdab89
+#define H2 0x98badcfe
+#define H3 0x10325476
+
+void md5_ref(uint8_t * input_data, uint32_t * digest, uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[128];
+
+ digest[0] = H0;
+ digest[1] = H1;
+ digest[2] = H2;
+ digest[3] = H3;
+
+ i = len;
+ while (i >= 64) {
+ md5_single(input_data, digest);
+ input_data += 64;
+ i -= 64;
+ }
+ // 0 <= i < 64
+
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+ for (j = i; j < 120; j++)
+ buf[j] = 0;
+
+ if (i > 64 - 8)
+ i = 128;
+ else
+ i = 64;
+
+ *(uint64_t *) (buf + i - 8) = to_le64((uint64_t) len * 8);
+
+ md5_single(buf, digest);
+ if (i == 128)
+ md5_single(buf + 64, digest);
+}
+
+#define F1(b,c,d) (d ^ (b & (c ^ d)))
+#define F2(b,c,d) (c ^ (d & (b ^ c)))
+#define F3(b,c,d) (b ^ c ^ d)
+#define F4(b,c,d) (c ^ (b | ~d))
+
+#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r))))
+
+#define step(i,a,b,c,d,f,k,w,r) \
+ if (i < 16) {f = F1(b,c,d); } else \
+ if (i < 32) {f = F2(b,c,d); } else \
+ if (i < 48) {f = F3(b,c,d); } else \
+ {f = F4(b,c,d); } \
+ f = a + f + k + to_le32(w); \
+ a = b + rol32(f, r);
+
+void md5_single(const uint8_t * data, uint32_t digest[4])
+{
+ uint32_t a, b, c, d;
+ uint32_t f;
+ uint32_t *w = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+
+ step(0, a, b, c, d, f, 0xd76aa478, w[0], 7);
+ step(1, d, a, b, c, f, 0xe8c7b756, w[1], 12);
+ step(2, c, d, a, b, f, 0x242070db, w[2], 17);
+ step(3, b, c, d, a, f, 0xc1bdceee, w[3], 22);
+ step(4, a, b, c, d, f, 0xf57c0faf, w[4], 7);
+ step(5, d, a, b, c, f, 0x4787c62a, w[5], 12);
+ step(6, c, d, a, b, f, 0xa8304613, w[6], 17);
+ step(7, b, c, d, a, f, 0xfd469501, w[7], 22);
+ step(8, a, b, c, d, f, 0x698098d8, w[8], 7);
+ step(9, d, a, b, c, f, 0x8b44f7af, w[9], 12);
+ step(10, c, d, a, b, f, 0xffff5bb1, w[10], 17);
+ step(11, b, c, d, a, f, 0x895cd7be, w[11], 22);
+ step(12, a, b, c, d, f, 0x6b901122, w[12], 7);
+ step(13, d, a, b, c, f, 0xfd987193, w[13], 12);
+ step(14, c, d, a, b, f, 0xa679438e, w[14], 17);
+ step(15, b, c, d, a, f, 0x49b40821, w[15], 22);
+
+ step(16, a, b, c, d, f, 0xf61e2562, w[1], 5);
+ step(17, d, a, b, c, f, 0xc040b340, w[6], 9);
+ step(18, c, d, a, b, f, 0x265e5a51, w[11], 14);
+ step(19, b, c, d, a, f, 0xe9b6c7aa, w[0], 20);
+ step(20, a, b, c, d, f, 0xd62f105d, w[5], 5);
+ step(21, d, a, b, c, f, 0x02441453, w[10], 9);
+ step(22, c, d, a, b, f, 0xd8a1e681, w[15], 14);
+ step(23, b, c, d, a, f, 0xe7d3fbc8, w[4], 20);
+ step(24, a, b, c, d, f, 0x21e1cde6, w[9], 5);
+ step(25, d, a, b, c, f, 0xc33707d6, w[14], 9);
+ step(26, c, d, a, b, f, 0xf4d50d87, w[3], 14);
+ step(27, b, c, d, a, f, 0x455a14ed, w[8], 20);
+ step(28, a, b, c, d, f, 0xa9e3e905, w[13], 5);
+ step(29, d, a, b, c, f, 0xfcefa3f8, w[2], 9);
+ step(30, c, d, a, b, f, 0x676f02d9, w[7], 14);
+ step(31, b, c, d, a, f, 0x8d2a4c8a, w[12], 20);
+
+ step(32, a, b, c, d, f, 0xfffa3942, w[5], 4);
+ step(33, d, a, b, c, f, 0x8771f681, w[8], 11);
+ step(34, c, d, a, b, f, 0x6d9d6122, w[11], 16);
+ step(35, b, c, d, a, f, 0xfde5380c, w[14], 23);
+ step(36, a, b, c, d, f, 0xa4beea44, w[1], 4);
+ step(37, d, a, b, c, f, 0x4bdecfa9, w[4], 11);
+ step(38, c, d, a, b, f, 0xf6bb4b60, w[7], 16);
+ step(39, b, c, d, a, f, 0xbebfbc70, w[10], 23);
+ step(40, a, b, c, d, f, 0x289b7ec6, w[13], 4);
+ step(41, d, a, b, c, f, 0xeaa127fa, w[0], 11);
+ step(42, c, d, a, b, f, 0xd4ef3085, w[3], 16);
+ step(43, b, c, d, a, f, 0x04881d05, w[6], 23);
+ step(44, a, b, c, d, f, 0xd9d4d039, w[9], 4);
+ step(45, d, a, b, c, f, 0xe6db99e5, w[12], 11);
+ step(46, c, d, a, b, f, 0x1fa27cf8, w[15], 16);
+ step(47, b, c, d, a, f, 0xc4ac5665, w[2], 23);
+
+ step(48, a, b, c, d, f, 0xf4292244, w[0], 6);
+ step(49, d, a, b, c, f, 0x432aff97, w[7], 10);
+ step(50, c, d, a, b, f, 0xab9423a7, w[14], 15);
+ step(51, b, c, d, a, f, 0xfc93a039, w[5], 21);
+ step(52, a, b, c, d, f, 0x655b59c3, w[12], 6);
+ step(53, d, a, b, c, f, 0x8f0ccc92, w[3], 10);
+ step(54, c, d, a, b, f, 0xffeff47d, w[10], 15);
+ step(55, b, c, d, a, f, 0x85845dd1, w[1], 21);
+ step(56, a, b, c, d, f, 0x6fa87e4f, w[8], 6);
+ step(57, d, a, b, c, f, 0xfe2ce6e0, w[15], 10);
+ step(58, c, d, a, b, f, 0xa3014314, w[6], 15);
+ step(59, b, c, d, a, f, 0x4e0811a1, w[13], 21);
+ step(60, a, b, c, d, f, 0xf7537e82, w[4], 6);
+ step(61, d, a, b, c, f, 0xbd3af235, w[11], 10);
+ step(62, c, d, a, b, f, 0x2ad7d2bb, w[2], 15);
+ step(63, b, c, d, a, f, 0xeb86d391, w[9], 21);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+}
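
md5_ref above computes a plain single-buffer MD5 and serves as the known-answer reference for the multi-buffer code. A minimal caller might look like the following sketch, assuming it is compiled and linked together with md5_ref.c; the prototype is copied from the definition above.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    void md5_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);

    int main(void)
    {
            uint8_t msg[] = "abc";
            uint32_t digest[4];

            md5_ref(msg, digest, (uint32_t) strlen((char *) msg));

            /* emit the conventional hex string: A, B, C, D, least-significant byte first */
            for (int i = 0; i < 4; i++)
                    for (int b = 0; b < 4; b++)
                            printf("%02x", (unsigned) ((digest[i] >> (8 * b)) & 0xff));
            printf("\n");   /* RFC 1321 gives 900150983cd24fb0d6963f7d28e17f72 for "abc" */
            return 0;
    }
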
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/Makefile.am b/src/crypto/isa-l/isa-l_crypto/mh_sha1/Makefile.am
new file mode 100644
index 000000000..696e9c57d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/Makefile.am
@@ -0,0 +1,83 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_mh_sha1_base = \
+ mh_sha1/mh_sha1_block_base.c \
+ mh_sha1/mh_sha1_finalize_base.c \
+ mh_sha1/mh_sha1_update_base.c \
+ mh_sha1/sha1_for_mh_sha1.c \
+ mh_sha1/mh_sha1.c
+
+lsrc_x86_64 += \
+ $(lsrc_mh_sha1_base) \
+ mh_sha1/mh_sha1_multibinary.asm \
+ mh_sha1/mh_sha1_block_sse.asm \
+ mh_sha1/mh_sha1_block_avx.asm \
+ mh_sha1/mh_sha1_block_avx2.asm \
+ mh_sha1/mh_sha1_block_avx512.asm \
+ mh_sha1/mh_sha1_avx512.c
+
+lsrc_x86_32 += $(lsrc_x86_64)
+
+lsrc_aarch64 += \
+ $(lsrc_mh_sha1_base) \
+ mh_sha1/aarch64/mh_sha1_multibinary.S \
+ mh_sha1/aarch64/mh_sha1_aarch64_dispatcher.c \
+ mh_sha1/aarch64/mh_sha1_block_asimd.S \
+ mh_sha1/aarch64/mh_sha1_asimd.c \
+ mh_sha1/aarch64/mh_sha1_block_ce.S \
+ mh_sha1/aarch64/mh_sha1_ce.c
+
+lsrc_base_aliases += \
+ $(lsrc_mh_sha1_base) \
+ mh_sha1/mh_sha1_base_aliases.c
+
+other_src += mh_sha1/mh_sha1_ref.c \
+ include/reg_sizes.asm \
+ include/multibinary.asm \
+ include/test.h \
+ mh_sha1/mh_sha1_internal.h
+
+src_include += -I $(srcdir)/mh_sha1
+
+extern_hdrs += include/mh_sha1.h
+
+check_tests += mh_sha1/mh_sha1_test
+unit_tests += mh_sha1/mh_sha1_update_test
+
+perf_tests += mh_sha1/mh_sha1_perf
+
+
+mh_sha1_test: mh_sha1_ref.o
+mh_sha1_mh_sha1_test_LDADD = mh_sha1/mh_sha1_ref.lo libisal_crypto.la
+
+mh_sha1_update_test: mh_sha1_ref.o
+mh_sha1_mh_sha1_update_test_LDADD = mh_sha1/mh_sha1_ref.lo libisal_crypto.la
+
+mh_sha1_mh_sha1_perf_LDADD = libisal_crypto.la
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_aarch64_dispatcher.c
new file mode 100644
index 000000000..2ad8871fa
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_aarch64_dispatcher.c
@@ -0,0 +1,55 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+DEFINE_INTERFACE_DISPATCHER(mh_sha1_update)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA1)
+ return PROVIDER_INFO(mh_sha1_update_ce);
+
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(mh_sha1_update_asimd);
+
+ return PROVIDER_BASIC(mh_sha1_update);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(mh_sha1_finalize)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA1)
+ return PROVIDER_INFO(mh_sha1_finalize_ce);
+
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(mh_sha1_finalize_asimd);
+
+ return PROVIDER_BASIC(mh_sha1_finalize);
+
+}
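
The dispatcher above picks an implementation at resolution time by probing the AArch64 hwcaps: the SHA-1 crypto extension is preferred, then plain ASIMD, then the portable base code. The probe is just getauxval(3); the standalone sketch below shows the same check in isolation (Linux/AArch64 only, not part of the library).

    #include <stdio.h>
    #include <sys/auxv.h>
    /* Depending on the libc, HWCAP_SHA1/HWCAP_ASIMD may also require <asm/hwcap.h>. */

    int main(void)
    {
            unsigned long hwcap = getauxval(AT_HWCAP);

            if (hwcap & HWCAP_SHA1)
                    puts("would pick the SHA-1 crypto-extension (ce) kernel");
            else if (hwcap & HWCAP_ASIMD)
                    puts("would pick the asimd kernel");
            else
                    puts("would fall back to the base implementation");
            return 0;
    }
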
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_asimd.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_asimd.c
new file mode 100644
index 000000000..c913a64df
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_asimd.c
@@ -0,0 +1,53 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <string.h>
+#include "mh_sha1_internal.h"
+
+void mh_sha1_block_asimd(const uint8_t * input_data,
+ uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+/***************mh_sha1_update***********/
+// mh_sha1_update_asimd.c
+#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_asimd
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_asimd
+#include "mh_sha1_update_base.c"
+#undef MH_SHA1_UPDATE_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+/***************mh_sha1_finalize AND mh_sha1_tail***********/
+// mh_sha1_tail processes the last, incomplete block of src data
+// mh_sha1_finalize is the mh_sha1_ctx-level wrapper around mh_sha1_tail
+// mh_sha1_finalize_asimd.c and mh_sha1_tail_asimd.c
+#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_asimd
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_asimd
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_asimd
+#include "mh_sha1_finalize_base.c"
+#undef MH_SHA1_FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
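
mh_sha1_asimd.c instantiates the generic update and finalize logic for this block kernel by defining the function-name macros and then #including the *_base.c templates. Below is a reduced, two-file sketch of the same include-as-template technique; every file and symbol name in it is invented for illustration.

    /* File scale_by2.c -- analogous to mh_sha1_asimd.c above.  It assumes a
     * sibling template file scale_base.c containing only:
     *
     *     int SCALE_FUNCTION(int x)
     *     {
     *             return x * SCALE_FACTOR;
     *     }
     *
     * i.e. code parameterized purely by macros the includer must define. */
    #include <stdio.h>

    #define SCALE_FUNCTION scale_by2
    #define SCALE_FACTOR   2
    #include "scale_base.c"          /* expands into: int scale_by2(int x) { ... } */
    #undef SCALE_FUNCTION
    #undef SCALE_FACTOR

    int main(void)
    {
            printf("%d\n", scale_by2(21));   /* prints 42 */
            return 0;
    }
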
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_asimd.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_asimd.S
new file mode 100644
index 000000000..22f716f27
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_asimd.S
@@ -0,0 +1,124 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .arch armv8-a
+
+#include "sha1_asimd_common.S"
+
+.macro load_x4_word idx:req
+ ld1 {WORD\idx\().16b},[segs_ptr]
+ add segs_ptr,segs_ptr,#64
+.endm
+
+/*
+ * void mh_sha1_block_asimd (const uint8_t * input_data,
+ * uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ * uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ * uint32_t num_blocks);
+ * arg 0 pointer to input data
+ * arg 1 pointer to digests, including the segment digests (uint32_t digests[16][5])
+ * arg 2 pointer to aligned_frame_buffer, which is used to save the big-endian data.
+ * arg 3 number of 1KB blocks
+ */
+
+ input_data .req x0
+ sha1_digest .req x1
+ data_buf .req x2
+ num_blocks .req w3
+ src .req x4
+ dst .req x5
+ offs .req x6
+ mh_segs .req x7
+ tmp .req x8
+ segs_ptr .req x9
+ block_ctr .req w10
+
+ .global mh_sha1_block_asimd
+ .type mh_sha1_block_asimd, %function
+mh_sha1_block_asimd:
+ cmp num_blocks, #0
+ beq .return
+ sha1_asimd_save_stack
+
+ mov mh_segs, #0
+.seg_loops:
+ add segs_ptr,input_data,mh_segs
+ mov offs, #64
+ add src, sha1_digest, mh_segs
+ ld1 {VA.4S}, [src], offs
+ ld1 {VB.4S}, [src], offs
+ ld1 {VC.4S}, [src], offs
+ ld1 {VD.4S}, [src], offs
+ ld1 {VE.4S}, [src], offs
+ mov block_ctr,num_blocks
+
+.block_loop:
+ sha1_single
+ subs block_ctr, block_ctr, 1
+ bne .block_loop
+
+ mov offs, #64
+ add dst, sha1_digest, mh_segs
+ st1 {VA.4S}, [dst], offs
+ st1 {VB.4S}, [dst], offs
+ st1 {VC.4S}, [dst], offs
+ st1 {VD.4S}, [dst], offs
+ st1 {VE.4S}, [dst], offs
+
+ add mh_segs, mh_segs, #16
+ cmp mh_segs, #64
+ bne .seg_loops
+
+ sha1_asimd_restore_stack
+.return:
+ ret
+
+ .size mh_sha1_block_asimd, .-mh_sha1_block_asimd
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 16
+KEY_0:
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+KEY_1:
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+KEY_2:
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+KEY_3:
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_ce.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_ce.S
new file mode 100644
index 000000000..12d3c5df2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_ce.S
@@ -0,0 +1,384 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg lane0_msg_0, 0
+ declare_var_vector_reg lane1_msg_0, 1
+ declare_var_vector_reg lane2_msg_0, 2
+ declare_var_vector_reg lane3_msg_0, 3
+ declare_var_vector_reg lane0_msg_1, 4
+ declare_var_vector_reg lane1_msg_1, 5
+ declare_var_vector_reg lane2_msg_1, 6
+ declare_var_vector_reg lane3_msg_1, 7
+ declare_var_vector_reg lane0_msg_2, 8
+ declare_var_vector_reg lane1_msg_2, 9
+ declare_var_vector_reg lane2_msg_2,10
+ declare_var_vector_reg lane3_msg_2,11
+ declare_var_vector_reg lane0_msg_3,12
+ declare_var_vector_reg lane1_msg_3,13
+ declare_var_vector_reg lane2_msg_3,14
+ declare_var_vector_reg lane3_msg_3,15
+
+ declare_var_vector_reg lane0_abcd ,16
+ declare_var_vector_reg lane1_abcd ,17
+ declare_var_vector_reg lane2_abcd ,18
+ declare_var_vector_reg lane3_abcd ,19
+ declare_var_vector_reg lane0_tmp0 ,20
+ declare_var_vector_reg lane1_tmp0 ,21
+ declare_var_vector_reg lane2_tmp0 ,22
+ declare_var_vector_reg lane3_tmp0 ,23
+ declare_var_vector_reg lane0_tmp1 ,24
+ declare_var_vector_reg lane1_tmp1 ,25
+ declare_var_vector_reg lane2_tmp1 ,26
+ declare_var_vector_reg lane3_tmp1 ,27
+
+
+ declare_var_vector_reg e0 ,28
+ declare_var_vector_reg e1 ,29
+ declare_var_vector_reg key ,30
+ declare_var_vector_reg tmp ,31
+
+ key_adr .req x4
+ msg_adr .req x5
+ block_cnt .req x6
+ offs .req x7
+ digest_adr .req x16
+ tmp0_adr .req x17
+ tmp1_adr .req x18
+
+/**
+macros for rounds 4-67
+*/
+.macro sha1_4_rounds inst:req,msg0:req,msg1:req,msg2:req,msg3:req,abcd:req,e0:req,tmp0:req,e1:req,tmp1:req
+ sha1h lane0_\tmp0\()_s, lane0_\abcd\()_s
+ sha1h lane1_\tmp0\()_s, lane1_\abcd\()_s
+ sha1h lane2_\tmp0\()_s, lane2_\abcd\()_s
+ sha1h lane3_\tmp0\()_s, lane3_\abcd\()_s
+ mov \e0\()_v.S[0],lane0_\tmp0\()_v.S[0]
+ mov \e0\()_v.S[1],lane1_\tmp0\()_v.S[0]
+ mov \e0\()_v.S[2],lane2_\tmp0\()_v.S[0]
+ mov \e0\()_v.S[3],lane3_\tmp0\()_v.S[0]
+ mov lane0_\tmp0\()_v.S[0],\e1\()_v.S[0]
+ mov lane1_\tmp0\()_v.S[0],\e1\()_v.S[1]
+ mov lane2_\tmp0\()_v.S[0],\e1\()_v.S[2]
+ mov lane3_\tmp0\()_v.S[0],\e1\()_v.S[3]
+ \inst lane0_\abcd\()_q,lane0_\tmp0\()_s,lane0_\tmp1\()_v.4s
+ \inst lane1_\abcd\()_q,lane1_\tmp0\()_s,lane1_\tmp1\()_v.4s
+ \inst lane2_\abcd\()_q,lane2_\tmp0\()_s,lane2_\tmp1\()_v.4s
+ \inst lane3_\abcd\()_q,lane3_\tmp0\()_s,lane3_\tmp1\()_v.4s
+ ld1 {lane0_\tmp0\()_v.4s-lane3_\tmp0\()_v.4s},[\tmp0\()_adr]
+ add lane0_\tmp1\()_v.4s,lane0_\msg3\()_v.4s,key_v.4s
+ add lane1_\tmp1\()_v.4s,lane1_\msg3\()_v.4s,key_v.4s
+ add lane2_\tmp1\()_v.4s,lane2_\msg3\()_v.4s,key_v.4s
+ add lane3_\tmp1\()_v.4s,lane3_\msg3\()_v.4s,key_v.4s
+ st1 {lane0_\tmp1\()_v.4s-lane3_\tmp1\()_v.4s},[\tmp1\()_adr]
+ sha1su1 lane0_\msg0\()_v.4s,lane0_\msg3\()_v.4s
+ sha1su1 lane1_\msg0\()_v.4s,lane1_\msg3\()_v.4s
+ sha1su1 lane2_\msg0\()_v.4s,lane2_\msg3\()_v.4s
+ sha1su1 lane3_\msg0\()_v.4s,lane3_\msg3\()_v.4s
+ sha1su0 lane0_\msg1\()_v.4s,lane0_\msg2\()_v.4s,lane0_\msg3\()_v.4s
+ sha1su0 lane1_\msg1\()_v.4s,lane1_\msg2\()_v.4s,lane1_\msg3\()_v.4s
+ sha1su0 lane2_\msg1\()_v.4s,lane2_\msg2\()_v.4s,lane2_\msg3\()_v.4s
+ sha1su0 lane3_\msg1\()_v.4s,lane3_\msg2\()_v.4s,lane3_\msg3\()_v.4s
+
+.endm
+
+
+/*
+ void mh_sha1_block_ce(const uint8_t * input_data,
+ uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks)
+*/
+/*
+Arguments list
+*/
+ input_data .req x0
+ digests .req x1
+ frame_buffer .req x2
+ num_blocks .req w3
+
+ .global mh_sha1_block_ce
+ .type mh_sha1_block_ce, %function
+mh_sha1_block_ce:
+ //save temp vector registers
+ stp d8, d9, [sp, -128]!
+
+ stp d10, d11, [sp, 16]
+ stp d12, d13, [sp, 32]
+ stp d14, d15, [sp, 48]
+ mov tmp0_adr,frame_buffer
+ add tmp1_adr,tmp0_adr,128
+
+
+start_loop:
+ mov block_cnt,0
+ mov msg_adr,input_data
+lane_loop:
+ mov offs,64
+ adr key_adr,KEY_0
+ //load msg 0
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[0],[msg_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[1],[msg_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[2],[msg_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[3],[msg_adr],offs
+
+ ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[0],[msg_adr],offs
+ ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[1],[msg_adr],offs
+ ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[2],[msg_adr],offs
+ ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[3],[msg_adr],offs
+
+ ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[0],[msg_adr],offs
+ ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[1],[msg_adr],offs
+ ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[2],[msg_adr],offs
+ ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[3],[msg_adr],offs
+
+ ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[0],[msg_adr],offs
+ ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[1],[msg_adr],offs
+ ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[2],[msg_adr],offs
+ ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[3],[msg_adr],offs
+
+ add digest_adr,digests,block_cnt
+ ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[0],[digest_adr],offs
+ ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[1],[digest_adr],offs
+ ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[2],[digest_adr],offs
+ ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[3],[digest_adr],offs
+ ldr e0_q,[digest_adr]
+
+ //load key_0
+ ldr key_q,[key_adr]
+
+ rev32 lane0_msg_0_v.16b,lane0_msg_0_v.16b
+ rev32 lane1_msg_0_v.16b,lane1_msg_0_v.16b
+ rev32 lane2_msg_0_v.16b,lane2_msg_0_v.16b
+ rev32 lane3_msg_0_v.16b,lane3_msg_0_v.16b
+ rev32 lane0_msg_1_v.16b,lane0_msg_1_v.16b
+ rev32 lane1_msg_1_v.16b,lane1_msg_1_v.16b
+ rev32 lane2_msg_1_v.16b,lane2_msg_1_v.16b
+ rev32 lane3_msg_1_v.16b,lane3_msg_1_v.16b
+ rev32 lane0_msg_2_v.16b,lane0_msg_2_v.16b
+ rev32 lane1_msg_2_v.16b,lane1_msg_2_v.16b
+ rev32 lane2_msg_2_v.16b,lane2_msg_2_v.16b
+ rev32 lane3_msg_2_v.16b,lane3_msg_2_v.16b
+ rev32 lane0_msg_3_v.16b,lane0_msg_3_v.16b
+ rev32 lane1_msg_3_v.16b,lane1_msg_3_v.16b
+ rev32 lane2_msg_3_v.16b,lane2_msg_3_v.16b
+ rev32 lane3_msg_3_v.16b,lane3_msg_3_v.16b
+
+ add lane0_tmp1_v.4s,lane0_msg_1_v.4s,key_v.4s
+ add lane1_tmp1_v.4s,lane1_msg_1_v.4s,key_v.4s
+ add lane2_tmp1_v.4s,lane2_msg_1_v.4s,key_v.4s
+ add lane3_tmp1_v.4s,lane3_msg_1_v.4s,key_v.4s
+ st1 {lane0_tmp1_v.4s-lane3_tmp1_v.4s},[tmp1_adr]
+
+ add lane0_tmp0_v.4s,lane0_msg_0_v.4s,key_v.4s
+ add lane1_tmp0_v.4s,lane1_msg_0_v.4s,key_v.4s
+ add lane2_tmp0_v.4s,lane2_msg_0_v.4s,key_v.4s
+ add lane3_tmp0_v.4s,lane3_msg_0_v.4s,key_v.4s
+
+ /* rounds 0-3 */
+ sha1h lane0_tmp1_s,lane0_abcd_s
+ sha1h lane1_tmp1_s,lane1_abcd_s
+ sha1h lane2_tmp1_s,lane2_abcd_s
+ sha1h lane3_tmp1_s,lane3_abcd_s
+ mov e1_v.S[0],lane0_tmp1_v.S[0]
+ mov e1_v.S[1],lane1_tmp1_v.S[0]
+ mov e1_v.S[2],lane2_tmp1_v.S[0]
+ mov e1_v.S[3],lane3_tmp1_v.S[0]
+ mov lane0_tmp1_v.S[0],e0_v.S[0]
+ mov lane1_tmp1_v.S[0],e0_v.S[1]
+ mov lane2_tmp1_v.S[0],e0_v.S[2]
+ mov lane3_tmp1_v.S[0],e0_v.S[3]
+ sha1c lane0_abcd_q,lane0_tmp1_s,lane0_tmp0_v.4s
+ sha1c lane1_abcd_q,lane1_tmp1_s,lane1_tmp0_v.4s
+ sha1c lane2_abcd_q,lane2_tmp1_s,lane2_tmp0_v.4s
+ sha1c lane3_abcd_q,lane3_tmp1_s,lane3_tmp0_v.4s
+ ld1 {lane0_tmp1_v.4s-lane3_tmp1_v.4s},[tmp1_adr]
+ add lane0_tmp0_v.4s,lane0_msg_2_v.4s,key_v.4s
+ sha1su0 lane0_msg_0_v.4s,lane0_msg_1_v.4s,lane0_msg_2_v.4s
+ add lane1_tmp0_v.4s,lane1_msg_2_v.4s,key_v.4s
+ sha1su0 lane1_msg_0_v.4s,lane1_msg_1_v.4s,lane1_msg_2_v.4s
+ add lane2_tmp0_v.4s,lane2_msg_2_v.4s,key_v.4s
+ sha1su0 lane2_msg_0_v.4s,lane2_msg_1_v.4s,lane2_msg_2_v.4s
+ add lane3_tmp0_v.4s,lane3_msg_2_v.4s,key_v.4s
+ sha1su0 lane3_msg_0_v.4s,lane3_msg_1_v.4s,lane3_msg_2_v.4s
+ st1 {lane0_tmp0_v.4s-lane3_tmp0_v.4s},[tmp0_adr]
+
+ sha1_4_rounds sha1c,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 4-7 */
+ sha1_4_rounds sha1c,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0
+
+
+ adr key_adr,KEY_1
+ ldr key_q,[key_adr]
+ sha1_4_rounds sha1c,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1 /* rounds 12-15 */
+ sha1_4_rounds sha1c,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0
+ sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 20-23 */
+ sha1_4_rounds sha1p,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0
+ sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1
+
+ adr key_adr,KEY_2
+ ldr key_q,[key_adr]
+ sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0
+ sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 36-39 */
+ sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0
+ sha1_4_rounds sha1m,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1
+ sha1_4_rounds sha1m,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0
+
+ adr key_adr,KEY_3
+ ldr key_q,[key_adr]
+ sha1_4_rounds sha1m,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 52-55 */
+ sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0
+ sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1
+ sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0
+
+ //msg2 and msg1 are free
+ mov lane0_msg_2_v.S[0],e1_v.S[0]
+ mov lane1_msg_2_v.S[0],e1_v.S[1]
+ mov lane2_msg_2_v.S[0],e1_v.S[2]
+ mov lane3_msg_2_v.S[0],e1_v.S[3]
+
+ /* rounds 68-71 */
+ sha1h lane0_msg_1_s,lane0_abcd_s
+ sha1h lane1_msg_1_s,lane1_abcd_s
+ sha1h lane2_msg_1_s,lane2_abcd_s
+ sha1h lane3_msg_1_s,lane3_abcd_s
+ sha1p lane0_abcd_q,lane0_msg_2_s,lane0_tmp1_v.4s
+ sha1p lane1_abcd_q,lane1_msg_2_s,lane1_tmp1_v.4s
+ sha1p lane2_abcd_q,lane2_msg_2_s,lane2_tmp1_v.4s
+ sha1p lane3_abcd_q,lane3_msg_2_s,lane3_tmp1_v.4s
+ add lane0_tmp1_v.4s,lane0_msg_3_v.4s,key_v.4s
+ add lane1_tmp1_v.4s,lane1_msg_3_v.4s,key_v.4s
+ add lane2_tmp1_v.4s,lane2_msg_3_v.4s,key_v.4s
+ add lane3_tmp1_v.4s,lane3_msg_3_v.4s,key_v.4s
+ sha1su1 lane0_msg_0_v.4s,lane0_msg_3_v.4s
+ sha1su1 lane1_msg_0_v.4s,lane1_msg_3_v.4s
+ sha1su1 lane2_msg_0_v.4s,lane2_msg_3_v.4s
+ sha1su1 lane3_msg_0_v.4s,lane3_msg_3_v.4s
+
+ /* rounds 72-75 */
+ sha1h lane0_msg_2_s,lane0_abcd_s
+ sha1h lane1_msg_2_s,lane1_abcd_s
+ sha1h lane2_msg_2_s,lane2_abcd_s
+ sha1h lane3_msg_2_s,lane3_abcd_s
+ sha1p lane0_abcd_q,lane0_msg_1_s,lane0_tmp0_v.4s
+ sha1p lane1_abcd_q,lane1_msg_1_s,lane1_tmp0_v.4s
+ sha1p lane2_abcd_q,lane2_msg_1_s,lane2_tmp0_v.4s
+ sha1p lane3_abcd_q,lane3_msg_1_s,lane3_tmp0_v.4s
+
+ /* rounds 76-79 */
+ sha1h lane0_msg_1_s,lane0_abcd_s
+ sha1h lane1_msg_1_s,lane1_abcd_s
+ sha1h lane2_msg_1_s,lane2_abcd_s
+ sha1h lane3_msg_1_s,lane3_abcd_s
+ sha1p lane0_abcd_q,lane0_msg_2_s,lane0_tmp1_v.4s
+ sha1p lane1_abcd_q,lane1_msg_2_s,lane1_tmp1_v.4s
+ sha1p lane2_abcd_q,lane2_msg_2_s,lane2_tmp1_v.4s
+ sha1p lane3_abcd_q,lane3_msg_2_s,lane3_tmp1_v.4s
+ add digest_adr,digests,block_cnt
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[0],[digest_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[1],[digest_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[2],[digest_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[3],[digest_adr],offs
+ ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[0],[digest_adr]
+
+ add lane0_abcd_v.4S,lane0_abcd_v.4S,lane0_msg_0_v.4S
+ add lane1_abcd_v.4S,lane1_abcd_v.4S,lane1_msg_0_v.4S
+ add lane2_abcd_v.4S,lane2_abcd_v.4S,lane2_msg_0_v.4S
+ add lane3_abcd_v.4S,lane3_abcd_v.4S,lane3_msg_0_v.4S
+
+ add lane0_msg_1_v.4S,lane0_msg_1_v.4S,lane0_msg_3_v.4S
+ add lane1_msg_1_v.4S,lane1_msg_1_v.4S,lane1_msg_3_v.4S
+ add lane2_msg_1_v.4S,lane2_msg_1_v.4S,lane2_msg_3_v.4S
+ add lane3_msg_1_v.4S,lane3_msg_1_v.4S,lane3_msg_3_v.4S
+
+ add digest_adr,digests,block_cnt
+ st4 {lane0_abcd_v.S-lane3_abcd_v.S}[0],[digest_adr],offs
+ st4 {lane0_abcd_v.S-lane3_abcd_v.S}[1],[digest_adr],offs
+ st4 {lane0_abcd_v.S-lane3_abcd_v.S}[2],[digest_adr],offs
+ st4 {lane0_abcd_v.S-lane3_abcd_v.S}[3],[digest_adr],offs
+ st4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[0],[digest_adr]
+
+ add block_cnt,block_cnt,16
+ cmp block_cnt,64
+ add msg_adr,input_data,block_cnt
+ add digest_adr,digests,block_cnt
+ bcc lane_loop
+
+ subs num_blocks,num_blocks,1
+ add input_data,input_data,1024
+ bhi start_loop
+exit_func:
+	//restore temp registers
+ ldp d10, d11, [sp, 16]
+ ldp d12, d13, [sp, 32]
+ ldp d14, d15, [sp, 48]
+ ldp d8, d9, [sp], 128
+ ret
+
+ .size mh_sha1_block_ce, .-mh_sha1_block_ce
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 4
+KEY_0:
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+KEY_1:
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+KEY_2:
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+KEY_3:
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_ce.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_ce.c
new file mode 100644
index 000000000..c35daeab0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_ce.c
@@ -0,0 +1,53 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <string.h>
+#include "mh_sha1_internal.h"
+
+void mh_sha1_block_ce(const uint8_t * input_data,
+ uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+/***************mh_sha1_update***********/
+// mh_sha1_update_ce.c
+#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_ce
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_ce
+#include "mh_sha1_update_base.c"
+#undef MH_SHA1_UPDATE_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+/***************mh_sha1_finalize AND mh_sha1_tail***********/
+// mh_sha1_tail processes the last, incomplete block of src data
+// mh_sha1_finalize is the mh_sha1_ctx-level wrapper around mh_sha1_tail
+// mh_sha1_finalize_ce.c and mh_sha1_tail_ce.c
+#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_ce
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_ce
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_ce
+#include "mh_sha1_finalize_base.c"
+#undef MH_SHA1_FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_multibinary.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_multibinary.S
new file mode 100644
index 000000000..9a6d0caea
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_multibinary.S
@@ -0,0 +1,35 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#include "aarch64_multibinary.h"
+
+
+mbin_interface mh_sha1_update
+mbin_interface mh_sha1_finalize
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/sha1_asimd_common.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/sha1_asimd_common.S
new file mode 100644
index 000000000..c8b8dd982
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/sha1_asimd_common.S
@@ -0,0 +1,269 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .arch armv8-a
+
+// macro F = (D ^ (B & (C ^ D)))
+.macro FUNC_F0
+ eor VF.16b, VC.16b, VD.16b
+ and VF.16b, VB.16b, VF.16b
+ eor VF.16b, VD.16b, VF.16b
+.endm
+
+// F = (B ^ C ^ D)
+.macro FUNC_F1
+ eor VF.16b, VB.16b, VC.16b
+ eor VF.16b, VF.16b, VD.16b
+.endm
+
+// F = ((B & C) | (B & D) | (C & D))
+.macro FUNC_F2
+ and vT0.16b, VB.16b, VC.16b
+ and vT1.16b, VB.16b, VD.16b
+ and vT2.16b, VC.16b, VD.16b
+ orr VF.16b, vT0.16b, vT1.16b
+ orr VF.16b, VF.16b, vT2.16b
+.endm
+
+// F = (B ^ C ^ D)
+.macro FUNC_F3
+ FUNC_F1
+.endm
+
+.altmacro
+.macro load_next_word windex
+ .if \windex < 16
+ load_x4_word \windex
+ .endif
+.endm
+
+// FUNC_F0 is merged into STEP_00_15 for efficiency
+.macro SHA1_STEP_00_15_F0 windex:req
+ rev32 WORD\windex\().16b,WORD\windex\().16b
+ next_word=\windex+1
+ load_next_word %next_word
+ // e = (a leftrotate 5) + f + e + k + w[i]
+ ushr VT.4s, VA.4s, 32 - 5
+ add VE.4s, VE.4s, VK.4s
+ sli VT.4s, VA.4s, 5
+ eor VF.16b, VC.16b, VD.16b
+ add VE.4s, VE.4s, WORD\windex\().4s
+ and VF.16b, VB.16b, VF.16b
+ add VE.4s, VE.4s, VT.4s
+ eor VF.16b, VD.16b, VF.16b
+ ushr VT.4s, VB.4s, 32 - 30
+ add VE.4s, VE.4s, VF.4s
+ sli VT.4s, VB.4s, 30
+.endm
+
+.macro SHA1_STEP_16_79 windex:req,func_f:req,reg_3:req,reg_8:req,reg_14:req,reg_16:req
+ eor vT0.16b,\reg_3\().16b,\reg_8\().16b
+ eor VT.16b,\reg_14\().16b,\reg_16\().16b
+ eor vT0.16b,vT0.16b,VT.16b
+ // e = (a leftrotate 5) + f + e + k + w[i]
+ ushr VT.4s, vT0.4s, 32 - 1
+ add VE.4s, VE.4s, VK.4s
+ ushr vT1.4s, VA.4s, 32 - 5
+ sli VT.4s, vT0.4s, 1
+ add VE.4s, VE.4s, VT.4s
+ sli vT1.4s, VA.4s, 5
+ mov \reg_16\().16b,VT.16b
+ add VE.4s, VE.4s, vT1.4s
+ ushr VT.4s, VB.4s, 32 - 30
+ \func_f
+ add VE.4s, VE.4s, VF.4s
+ sli VT.4s, VB.4s, 30
+.endm
+
+ VA .req v0
+ VB .req v1
+ VC .req v2
+ VD .req v3
+ VE .req v4
+ VT .req v5
+ VF .req v6
+ VK .req v7
+ WORD0 .req v8
+ WORD1 .req v9
+ WORD2 .req v10
+ WORD3 .req v11
+ WORD4 .req v12
+ WORD5 .req v13
+ WORD6 .req v14
+ WORD7 .req v15
+ WORD8 .req v16
+ WORD9 .req v17
+ WORD10 .req v18
+ WORD11 .req v19
+ WORD12 .req v20
+ WORD13 .req v21
+ WORD14 .req v22
+ WORD15 .req v23
+ vT0 .req v24
+ vT1 .req v25
+ vT2 .req v26
+ vAA .req v27
+ vBB .req v28
+ vCC .req v29
+ vDD .req v30
+ vEE .req v31
+ TT .req v0
+ sha1key_adr .req x15
+
+.macro SWAP_STATES
+ // shifted VB is held in VT after each step
+ .unreq TT
+ TT .req VE
+ .unreq VE
+ VE .req VD
+ .unreq VD
+ VD .req VC
+ .unreq VC
+ VC .req VT
+ .unreq VT
+ VT .req VB
+ .unreq VB
+ VB .req VA
+ .unreq VA
+ VA .req TT
+.endm
+
+.altmacro
+.macro SHA1_STEP_16_79_WRAPPER windex:req,func_f:req,idx3:req,idx8:req,idx14:req,idx16:req
+ SHA1_STEP_16_79 \windex,\func_f,WORD\idx3\(),WORD\idx8\(),WORD\idx14\(),WORD\idx16\()
+.endm
+
+.macro exec_step windex:req
+ .if \windex <= 15
+ SHA1_STEP_00_15_F0 windex
+ .else
+ idx14=((\windex - 14) & 15)
+ idx8=((\windex - 8) & 15)
+ idx3=((\windex - 3) & 15)
+ idx16=(\windex & 15)
+ .if \windex <= 19
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F0,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 20 && \windex <= 39
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F1,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 40 && \windex <= 59
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F2,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 60 && \windex <= 79
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F3,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .endif
+
+ SWAP_STATES
+
+ .if \windex == 79
+		// after 80 steps, the registers ABCDET have shifted from
+		// their original order of 012345 to 341520
+		// swap back for both compile- and run-time correctness
+ mov v0.16b,v3.16b
+ .unreq VA
+ VA .req v0
+
+ mov vT0.16b,v2.16b
+ mov v2.16b,v1.16b
+ mov v1.16b,v4.16b
+ .unreq VB
+ VB .req v1
+ .unreq VC
+ VC .req v2
+
+ mov v3.16b,v5.16b
+ .unreq VD
+ VD .req v3
+
+ mov v4.16b,vT0.16b
+ .unreq VE
+ VE .req v4
+
+ .unreq VT
+ VT .req v5
+ .endif
+.endm
+
+.macro exec_steps idx:req,more:vararg
+ exec_step \idx
+ .ifnb \more
+ exec_steps \more
+ .endif
+.endm
+
+.macro sha1_single
+ load_x4_word 0
+
+ mov vAA.16B, VA.16B
+ mov vBB.16B, VB.16B
+ mov vCC.16B, VC.16B
+ mov vDD.16B, VD.16B
+ mov vEE.16B, VE.16B
+
+ adr sha1key_adr, KEY_0
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
+
+ // 20 ~ 39
+ adr sha1key_adr, KEY_1
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
+
+ // 40 ~ 59
+ adr sha1key_adr, KEY_2
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59
+
+ // 60 ~ 79
+ adr sha1key_adr, KEY_3
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79
+
+ add VA.4s, vAA.4s, VA.4s
+ add VB.4s, vBB.4s, VB.4s
+ add VC.4s, vCC.4s, VC.4s
+ add VD.4s, vDD.4s, VD.4s
+ add VE.4s, vEE.4s, VE.4s
+.endm
+
+.macro sha1_asimd_save_stack
+ stp d8,d9,[sp, -64]!
+ stp d10,d11,[sp, 16]
+ stp d12,d13,[sp, 32]
+ stp d14,d15,[sp, 48]
+.endm
+
+.macro sha1_asimd_restore_stack
+ ldp d10,d11,[sp, 16]
+ ldp d12,d13,[sp, 32]
+ ldp d14,d15,[sp, 48]
+ ldp d8,d9,[sp],64
+.endm
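
The FUNC_F0-FUNC_F3 macros above apply the standard SHA-1 round functions to the four 32-bit lanes of a vector register, and the SHA1_STEP_* macros interleave the e-update with the rotation of b. A minimal scalar C sketch of the same per-lane logic (helper names are illustrative, not part of the patch):

#include <stdint.h>

static inline uint32_t rol32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));	/* the ushr/sli pairs above */
}

/* Scalar forms of FUNC_F0..FUNC_F2; FUNC_F3 is the same parity function as FUNC_F1. */
static inline uint32_t f0(uint32_t b, uint32_t c, uint32_t d) { return d ^ (b & (c ^ d)); }
static inline uint32_t f1(uint32_t b, uint32_t c, uint32_t d) { return b ^ c ^ d; }
static inline uint32_t f2(uint32_t b, uint32_t c, uint32_t d) { return (b & c) | (b & d) | (c & d); }

/* One step as performed by SHA1_STEP_00_15_F0 / SHA1_STEP_16_79:
 * e += rol32(a,5) + f(b,c,d) + k + w[i], then b is rotated left by 30. */
static inline void sha1_step(uint32_t a, uint32_t *b, uint32_t *e,
			     uint32_t f, uint32_t k, uint32_t w)
{
	*e += rol32(a, 5) + f + k + w;
	*b = rol32(*b, 30);
}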
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1.c
new file mode 100644
index 000000000..e5d8ad86d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1.c
@@ -0,0 +1,141 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha1_internal.h"
+
+int mh_sha1_init(struct mh_sha1_ctx *ctx)
+{
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS];
+ uint32_t i;
+
+ if (ctx == NULL)
+ return MH_SHA1_CTX_ERROR_NULL;
+
+ memset(ctx, 0, sizeof(*ctx));
+
+ mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests;
+ for (i = 0; i < HASH_SEGS; i++) {
+ mh_sha1_segs_digests[0][i] = MH_SHA1_H0;
+ mh_sha1_segs_digests[1][i] = MH_SHA1_H1;
+ mh_sha1_segs_digests[2][i] = MH_SHA1_H2;
+ mh_sha1_segs_digests[3][i] = MH_SHA1_H3;
+ mh_sha1_segs_digests[4][i] = MH_SHA1_H4;
+ }
+
+ return MH_SHA1_CTX_ERROR_NONE;
+}
+
+#if (!defined(NOARCH)) && (defined(__i386__) || defined(__x86_64__) \
+ || defined( _M_X64) || defined(_M_IX86))
+/***************mh_sha1_update***********/
+// mh_sha1_update_sse.c
+#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_sse
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_sse
+#include "mh_sha1_update_base.c"
+#undef MH_SHA1_UPDATE_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+// mh_sha1_update_avx.c
+#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_avx
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx
+#include "mh_sha1_update_base.c"
+#undef MH_SHA1_UPDATE_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+// mh_sha1_update_avx2.c
+#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_avx2
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx2
+#include "mh_sha1_update_base.c"
+#undef MH_SHA1_UPDATE_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+/***************mh_sha1_finalize AND mh_sha1_tail***********/
+// mh_sha1_tail handles the last, incomplete block of source data
+// mh_sha1_finalize is an mh_sha1_ctx wrapper around mh_sha1_tail
+
+// mh_sha1_finalize_sse.c and mh_sha1_tail_sse.c
+#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_sse
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_sse
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_sse
+#include "mh_sha1_finalize_base.c"
+#undef MH_SHA1_FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+// mh_sha1_finalize_avx.c and mh_sha1_tail_avx.c
+#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_avx
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx
+#include "mh_sha1_finalize_base.c"
+#undef MH_SHA1_FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+// mh_sha1_finalize_avx2.c and mh_sha1_tail_avx2.c
+#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_avx2
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx2
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx2
+#include "mh_sha1_finalize_base.c"
+#undef MH_SHA1_FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+/***************version info***********/
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+// mh_sha1_init version info
+struct slver mh_sha1_init_slver_00000271;
+struct slver mh_sha1_init_slver = { 0x0271, 0x00, 0x00 };
+
+// mh_sha1_update version info
+struct slver mh_sha1_update_sse_slver_00000274;
+struct slver mh_sha1_update_sse_slver = { 0x0274, 0x00, 0x00 };
+
+struct slver mh_sha1_update_avx_slver_02000276;
+struct slver mh_sha1_update_avx_slver = { 0x0276, 0x00, 0x02 };
+
+struct slver mh_sha1_update_avx2_slver_04000278;
+struct slver mh_sha1_update_avx2_slver = { 0x0278, 0x00, 0x04 };
+
+// mh_sha1_finalize version info
+struct slver mh_sha1_finalize_sse_slver_00000275;
+struct slver mh_sha1_finalize_sse_slver = { 0x0275, 0x00, 0x00 };
+
+struct slver mh_sha1_finalize_avx_slver_02000277;
+struct slver mh_sha1_finalize_avx_slver = { 0x0277, 0x00, 0x02 };
+
+struct slver mh_sha1_finalize_avx2_slver_04000279;
+struct slver mh_sha1_finalize_avx2_slver = { 0x0279, 0x00, 0x04 };
+
+#endif
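
The repeated define/include/undef blocks above stamp out one generic body per SIMD variant: each inclusion of mh_sha1_update_base.c / mh_sha1_finalize_base.c is compiled with its function names bound to a different block routine. A compact, self-contained illustration of the same parameterize-and-instantiate idiom, done with a macro instead of a re-included file (all names invented for the example):

#include <stdio.h>

/* Generic body, parameterized by the names the "including" site supplies,
 * in the same spirit as MH_SHA1_UPDATE_FUNCTION / MH_SHA1_BLOCK_FUNCTION. */
#define DEFINE_SUM(UPDATE_NAME, STEP_NAME)                        \
	static int STEP_NAME(int acc, int x) { return acc + x; } \
	static int UPDATE_NAME(const int *v, int n)              \
	{                                                        \
		int acc = 0;                                     \
		for (int i = 0; i < n; i++)                      \
			acc = STEP_NAME(acc, v[i]);              \
		return acc;                                      \
	}

/* Two "architecture variants" generated from the one template. */
DEFINE_SUM(sum_generic, step_generic)
DEFINE_SUM(sum_other, step_other)

int main(void)
{
	const int v[3] = { 1, 2, 3 };
	printf("%d %d\n", sum_generic(v, 3), sum_other(v, 3));	/* 6 6 */
	return 0;
}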
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_avx512.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_avx512.c
new file mode 100644
index 000000000..1305d048f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_avx512.c
@@ -0,0 +1,70 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha1_internal.h"
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+/***************mh_sha1_update***********/
+// mh_sha1_update_avx512.c
+#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_avx512
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx512
+#include "mh_sha1_update_base.c"
+#undef MH_SHA1_UPDATE_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+/***************mh_sha1_finalize AND mh_sha1_tail***********/
+// mh_sha1_tail handles the last, incomplete block of source data
+// mh_sha1_finalize is an mh_sha1_ctx wrapper around mh_sha1_tail
+// mh_sha1_finalize_avx512.c and mh_sha1_tail_avx512.c
+#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_avx512
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx512
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx512
+#include "mh_sha1_finalize_base.c"
+#undef MH_SHA1_FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+/***************version info***********/
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// mh_sha1_update version info
+struct slver mh_sha1_update_avx512_slver_0600027c;
+struct slver mh_sha1_update_avx512_slver = { 0x027c, 0x00, 0x06 };
+
+// mh_sha1_finalize version info
+struct slver mh_sha1_finalize_avx512_slver_0600027d;
+struct slver mh_sha1_finalize_avx512_slver = { 0x027d, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
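
The slver objects above carry a routine's version both in the initializer and in the symbol name: reading the fields as core, ver, snum reproduces the hex suffix (0x06, 0x00, 0x027c for mh_sha1_update_avx512_slver_0600027c). A small stand-alone check of that correspondence (the packing order is inferred from the initializers above, not taken from an ISA-L API):

#include <stdint.h>
#include <stdio.h>

struct slver {
	uint16_t snum;
	uint8_t ver;
	uint8_t core;
};

int main(void)
{
	struct slver s = { 0x027c, 0x00, 0x06 };	/* mh_sha1_update_avx512 */
	uint32_t tag = ((uint32_t)s.core << 24) | ((uint32_t)s.ver << 16) | s.snum;
	printf("%08x\n", tag);				/* prints 0600027c */
	return 0;
}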
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_base_aliases.c
new file mode 100644
index 000000000..18cd8161b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_base_aliases.c
@@ -0,0 +1,40 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "mh_sha1_internal.h"
+#include <string.h>
+int mh_sha1_update(struct mh_sha1_ctx *ctx, const void *buffer, uint32_t len)
+{
+ return mh_sha1_update_base(ctx, buffer, len);
+
+}
+
+int mh_sha1_finalize(struct mh_sha1_ctx *ctx, void *mh_sha1_digest)
+{
+ return mh_sha1_finalize_base(ctx, mh_sha1_digest);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx.asm
new file mode 100644
index 000000000..f4b5e76a0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx.asm
@@ -0,0 +1,506 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA-1 digests in parallel using AVX
+;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF, %%regC,%%regD
+ vpand %%regF, %%regF,%%regB
+ vpxor %%regF, %%regF,%%regD
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF,%%regD,%%regC
+ vpxor %%regF,%%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpor %%regF,%%regB,%%regC
+ vpand %%regT,%%regB,%%regC
+ vpand %%regF,%%regF,%%regD
+ vpor %%regF,%%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-(%%imm))
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PROLD_nd reg, imm, tmp, src
+%macro PROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsrld %%tmp, %%src, (32-(%%imm))
+ vpslld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+ vpaddd %%regE, %%regE,[%%data + (%%memW * 16)]
+ PROLD_nd %%regT,5, %%regF,%%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE, %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp2
+%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovups
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4
+%define F xmm5 ; tmp
+%define G xmm6 ; tmp
+
+%define TMP G
+%define FUN F
+%define K xmm7
+
+%define AA xmm8
+%define BB xmm9
+%define CC xmm10
+%define DD xmm11
+%define EE xmm12
+
+%define T0 xmm6
+%define T1 xmm7
+%define T2 xmm8
+%define T3 xmm9
+%define T4 xmm10
+%define T5 xmm11
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define W14 xmm13
+%define W15 xmm14
+%define W16 xmm15
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h4 | h4 | h4 | ...| h4 | | Ea| Eb | Ec |...| Ep |
+
+align 32
+
+;void mh_sha1_block_avx(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including segment digests (uint32_t digests[16][5])
+; arg 2 pointer to aligned_frame_buffer, used to store the big-endian data
+; arg 3 number of 1KB blocks
+;
+mk_global mh_sha1_block_avx, function, internal
+func(mh_sha1_block_avx)
+ endbranch
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 16 Bytes needed by avx
+ and rsp, ~0x0F
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 5
+ VMOVPS A, [mh_digests_p + I*64 + 16*0]
+ VMOVPS B, [mh_digests_p + I*64 + 16*1]
+ VMOVPS C, [mh_digests_p + I*64 + 16*2]
+ VMOVPS D, [mh_digests_p + I*64 + 16*3]
+
+ vmovdqa [rsp + I*64 + 16*0], A
+ vmovdqa [rsp + I*64 + 16*1], B
+ vmovdqa [rsp + I*64 + 16*2], C
+ vmovdqa [rsp + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*5 to DWORD*4_SEGS*5*4
+ %assign I 0
+ %rep 16
+ VMOVPS T0,[mh_in_p + I*64+0*16]
+ VMOVPS T1,[mh_in_p + I*64+1*16]
+ VMOVPS T2,[mh_in_p + I*64+2*16]
+ VMOVPS T3,[mh_in_p + I*64+3*16]
+
+ vpshufb T0, F
+ vmovdqa [mh_data_p +(I)*16 +0*256],T0
+ vpshufb T1, F
+ vmovdqa [mh_data_p +(I)*16 +1*256],T1
+ vpshufb T2, F
+ vmovdqa [mh_data_p +(I)*16 +2*256],T2
+ vpshufb T3, F
+ vmovdqa [mh_data_p +(I)*16 +3*256],T3
+ %assign I (I+1)
+ %endrep
+
+ mov mh_segs, 0 ;start from the first 4 segments
+	mov	pref, 1024		;avoid repeated prefetching
+ .segs_loop:
+ ;; Initialize digests
+ vmovdqa A, [rsp + 0*64 + mh_segs]
+ vmovdqa B, [rsp + 1*64 + mh_segs]
+ vmovdqa C, [rsp + 2*64 + mh_segs]
+ vmovdqa D, [rsp + 3*64 + mh_segs]
+ vmovdqa E, [rsp + 4*64 + mh_segs]
+
+ vmovdqa AA, A
+ vmovdqa BB, B
+ vmovdqa CC, C
+ vmovdqa DD, D
+ vmovdqa EE, E
+;;
+;; perform 0-79 steps
+;;
+ vmovdqa K, [K00_19]
+;; do rounds 0...15
+ %assign I 0
+ %rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 16...19
+ vmovdqa W16, [mh_data_p + ((16 - 16) & 15) * 16]
+ vmovdqa W15, [mh_data_p + ((16 - 15) & 15) * 16]
+ %rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*0]
+;; do rounds 20...39
+ vmovdqa K, [K20_39]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 40...59
+ vmovdqa K, [K40_59]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*1]
+;; do rounds 60...79
+ vmovdqa K, [K60_79]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+ vpaddd A, AA
+ vpaddd B, BB
+ vpaddd C, CC
+ vpaddd D, DD
+ vpaddd E, EE
+
+ ; write out digests
+ vmovdqa [rsp + 0*64 + mh_segs], A
+ vmovdqa [rsp + 1*64 + mh_segs], B
+ vmovdqa [rsp + 2*64 + mh_segs], C
+ vmovdqa [rsp + 3*64 + mh_segs], D
+ vmovdqa [rsp + 4*64 + mh_segs], E
+
+ add pref, 256
+ add mh_data_p, 256
+ add mh_segs, 16
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 5
+ vmovdqa A, [rsp + I*64 + 16*0]
+ vmovdqa B, [rsp + I*64 + 16*1]
+ vmovdqa C, [rsp + I*64 + 16*2]
+ vmovdqa D, [rsp + I*64 + 16*3]
+
+ VMOVPS [mh_digests_p + I*64 + 16*0], A
+ VMOVPS [mh_digests_p + I*64 + 16*1], B
+ VMOVPS [mh_digests_p + I*64 + 16*2], C
+ VMOVPS [mh_digests_p + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=16
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
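
As the layout comments above describe, each 1KB block carries 16 interleaved SHA-1 segments and the working digests are kept as digests[5][16]; this routine advances four segments per XMM register, four groups per block. A plain scalar sketch of the per-block result, lane by lane, following the word interleaving used by the base C implementation later in this patch (word i of segment s at dword index i*HASH_SEGS + s, with the words already byte-swapped to big-endian order as the frame-buffer shuffle does; the function name is illustrative):

#include <stdint.h>

#define HASH_SEGS 16	/* independent SHA-1 lanes per 1KB block */

static inline uint32_t rol32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

void mh_sha1_block_scalar_sketch(const uint32_t block_be[16 * HASH_SEGS],
				 uint32_t digests[5][HASH_SEGS])
{
	for (int s = 0; s < HASH_SEGS; s++) {
		uint32_t w[80];
		for (int i = 0; i < 16; i++)
			w[i] = block_be[i * HASH_SEGS + s];
		for (int i = 16; i < 80; i++)
			w[i] = rol32(w[i - 3] ^ w[i - 8] ^ w[i - 14] ^ w[i - 16], 1);

		uint32_t a = digests[0][s], b = digests[1][s], c = digests[2][s];
		uint32_t d = digests[3][s], e = digests[4][s];

		for (int i = 0; i < 80; i++) {
			uint32_t f, k;
			if (i < 20)      { f = d ^ (b & (c ^ d));           k = 0x5A827999; }
			else if (i < 40) { f = b ^ c ^ d;                   k = 0x6ED9EBA1; }
			else if (i < 60) { f = (b & c) | (b & d) | (c & d); k = 0x8F1BBCDC; }
			else             { f = b ^ c ^ d;                   k = 0xCA62C1D6; }
			uint32_t t = rol32(a, 5) + f + e + k + w[i];
			e = d; d = c; c = rol32(b, 30); b = a; a = t;
		}

		digests[0][s] += a; digests[1][s] += b; digests[2][s] += c;
		digests[3][s] += d; digests[4][s] += e;
	}
}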
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx2.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx2.asm
new file mode 100644
index 000000000..fed35d83e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx2.asm
@@ -0,0 +1,508 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA-1 digests in parallel using AVX2
+;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; Magic functions defined in FIPS 180-1
+;;
+;MAGIC_F0 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | ((~ B) & D) )
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpand %%regF, %%regB,%%regC
+ vpandn %%regT, %%regB,%%regD
+ vpor %%regF, %%regT,%%regF
+%endmacro
+
+;MAGIC_F1 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF,%%regD,%%regC
+ vpxor %%regF,%%regF,%%regB
+%endmacro
+
+
+
+;MAGIC_F2 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpor %%regF,%%regB,%%regC
+ vpand %%regT,%%regB,%%regC
+ vpand %%regF,%%regF,%%regD
+ vpor %%regF,%%regF,%%regT
+%endmacro
+
+;MAGIC_F3 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-%%imm)
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsrld %%tmp, %%src, (32-%%imm)
+ vpslld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+ vpaddd %%regE, %%regE,[%%data + (%%memW * 32)]
+ PROLD_nd %%regT,5, %%regF,%%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE, %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 32]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 32]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 32]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 32],%%regF
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp2
+%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovups
+
+%define A ymm0
+%define B ymm1
+%define C ymm2
+%define D ymm3
+%define E ymm4
+
+%define F ymm5
+%define T0 ymm6
+%define T1 ymm7
+%define T2 ymm8
+%define T3 ymm9
+%define T4 ymm10
+%define T5 ymm11
+%define T6 ymm12
+%define T7 ymm13
+%define T8 ymm14
+%define T9 ymm15
+
+%define AA ymm5
+%define BB ymm6
+%define CC ymm7
+%define DD ymm8
+%define EE ymm9
+%define TMP ymm10
+%define FUN ymm11
+%define K ymm12
+%define W14 ymm13
+%define W15 ymm14
+%define W16 ymm15
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h4 | h4 | h4 | ...| h4 | | Ea| Eb | Ec |...| Ep |
+
+align 32
+
+;void mh_sha1_block_avx2(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including segment digests (uint32_t digests[16][5])
+; arg 2 pointer to aligned_frame_buffer, used to store the big-endian data
+; arg 3 number of 1KB blocks
+;
+mk_global mh_sha1_block_avx2, function, internal
+func(mh_sha1_block_avx2)
+ endbranch
+ FUNC_SAVE
+
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ test loops, loops
+ jz .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 32 Bytes needed by avx2
+ and rsp, ~0x1F
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 2
+ VMOVPS A, [mh_digests_p + I*32*5 + 32*0]
+ VMOVPS B, [mh_digests_p + I*32*5 + 32*1]
+ VMOVPS C, [mh_digests_p + I*32*5 + 32*2]
+ VMOVPS D, [mh_digests_p + I*32*5 + 32*3]
+ VMOVPS E, [mh_digests_p + I*32*5 + 32*4]
+
+ vmovdqa [rsp + I*32*5 + 32*0], A
+ vmovdqa [rsp + I*32*5 + 32*1], B
+ vmovdqa [rsp + I*32*5 + 32*2], C
+ vmovdqa [rsp + I*32*5 + 32*3], D
+ vmovdqa [rsp + I*32*5 + 32*4], E
+ %assign I (I+1)
+ %endrep
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ vbroadcasti128 F, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*5 to DWORD*8_SEGS*5*2
+%assign I 0
+%rep 16
+ VMOVPS T0,[mh_in_p + I*64+0*32]
+ VMOVPS T1,[mh_in_p + I*64+1*32]
+
+ vpshufb T0, T0, F
+ vmovdqa [mh_data_p +I*32+0*512],T0
+ vpshufb T1, T1, F
+ vmovdqa [mh_data_p +I*32+1*512],T1
+%assign I (I+1)
+%endrep
+
+ xor mh_segs, mh_segs ;start from the first 8 segments
+	mov	pref, 1024		;avoid repeated prefetching
+ .segs_loop:
+ ;; Initialize digests
+ vmovdqa A, [rsp + 0*64 + mh_segs]
+ vmovdqa B, [rsp + 1*64 + mh_segs]
+ vmovdqa C, [rsp + 2*64 + mh_segs]
+ vmovdqa D, [rsp + 3*64 + mh_segs]
+ vmovdqa E, [rsp + 4*64 + mh_segs]
+
+ vmovdqa AA, A
+ vmovdqa BB, B
+ vmovdqa CC, C
+ vmovdqa DD, D
+ vmovdqa EE, E
+;;
+;; perform 0-79 steps
+;;
+ vpbroadcastq K, [K00_19]
+;; do rounds 0...15
+ %assign I 0
+ %rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19
+ vmovdqa W16, [mh_data_p + ((16 - 16) & 15) * 32]
+ vmovdqa W15, [mh_data_p + ((16 - 15) & 15) * 32]
+ %rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*0]
+ PREFETCH_X [mh_in_p + pref+128*1]
+;; do rounds 20...39
+ vpbroadcastq K, [K20_39]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+;; do rounds 40...59
+ vpbroadcastq K, [K40_59]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*2]
+ PREFETCH_X [mh_in_p + pref+128*3]
+;; do rounds 60...79
+ vpbroadcastq K, [K60_79]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+ vpaddd A,A, AA
+ vpaddd B,B, BB
+ vpaddd C,C, CC
+ vpaddd D,D, DD
+ vpaddd E,E, EE
+
+ ; write out digests
+ vmovdqa [rsp + 0*64 + mh_segs], A
+ vmovdqa [rsp + 1*64 + mh_segs], B
+ vmovdqa [rsp + 2*64 + mh_segs], C
+ vmovdqa [rsp + 3*64 + mh_segs], D
+ vmovdqa [rsp + 4*64 + mh_segs], E
+
+ add pref, 512
+
+ add mh_data_p, 512
+ add mh_segs, 32
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 2
+ vmovdqa A, [rsp + I*32*5 + 32*0]
+ vmovdqa B, [rsp + I*32*5 + 32*1]
+ vmovdqa C, [rsp + I*32*5 + 32*2]
+ vmovdqa D, [rsp + I*32*5 + 32*3]
+ vmovdqa E, [rsp + I*32*5 + 32*4]
+
+ VMOVPS [mh_digests_p + I*32*5 + 32*0], A
+ VMOVPS [mh_digests_p + I*32*5 + 32*1], B
+ VMOVPS [mh_digests_p + I*32*5 + 32*2], C
+ VMOVPS [mh_digests_p + I*32*5 + 32*3], D
+ VMOVPS [mh_digests_p + I*32*5 + 32*4], E
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .rodata align=32
+
+align 32
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19: dq 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6
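
PSHUFFLE_BYTE_FLIP_MASK above selects bytes 3,2,1,0 within every dword, so vpshufb with this mask is simply a per-dword byte swap that turns the little-endian input words into the big-endian form SHA-1 operates on. The scalar equivalent is a plain 32-bit byte swap (sketch; the helper name is illustrative):

#include <stdint.h>
#include <stdio.h>

static inline uint32_t to_be32_sketch(uint32_t x)
{
	return (x >> 24) | ((x >> 8) & 0x0000FF00u) |
	       ((x << 8) & 0x00FF0000u) | (x << 24);
}

int main(void)
{
	printf("%08x\n", to_be32_sketch(0x00010203u));	/* prints 03020100 */
	return 0;
}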
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx512.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx512.asm
new file mode 100644
index 000000000..a72c21661
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx512.asm
@@ -0,0 +1,406 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA-1 digests in parallel using AVX-512
+;;
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovdqu64
+;SIMD variables definition
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define HH0 zmm5
+%define HH1 zmm6
+%define HH2 zmm7
+%define HH3 zmm8
+%define HH4 zmm9
+%define KT zmm10
+%define XTMP0 zmm11
+%define XTMP1 zmm12
+%define SHUF_MASK zmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;using 16 extra ZMM registers to hold the byte-reversed (big-endian) input data
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;macros definition
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%F_IMMED %2
+
+ ; T = ROTL_5(A) + Ft(B,C,D) + E + Kt + Wt
+ ; E=D, D=C, C=ROTL_30(B), B=A, A=T
+
+ ; Ft
+ ; 0-19 Ch(B,C,D) = (B&C) ^ (~B&D)
+ ; 20-39, 60-79 Parity(B,C,D) = B ^ C ^ D
+ ; 40-59 Maj(B,C,D) = (B&C) ^ (B&D) ^ (C&D)
+
+ vmovdqa32 XTMP1, B ; Copy B
+ vpaddd E, E, %%WT ; E = E + Wt
+ vpternlogd XTMP1, C, D, %%F_IMMED ; TMP1 = Ft(B,C,D)
+ vpaddd E, E, KT ; E = E + Wt + Kt
+ vprold XTMP0, A, 5 ; TMP0 = ROTL_5(A)
+ vpaddd E, E, XTMP1 ; E = Ft(B,C,D) + E + Kt + Wt
+ vprold B, B, 30 ; B = ROTL_30(B)
+ vpaddd E, E, XTMP0 ; E = T
+
+ ROTATE_ARGS
+%endmacro
+
+%macro MSG_SCHED_ROUND_16_79 4
+%define %%WT %1
+%define %%WTp2 %2
+%define %%WTp8 %3
+%define %%WTp13 %4
+ ; Wt = ROTL_1(Wt-3 ^ Wt-8 ^ Wt-14 ^ Wt-16)
+ ; Wt+16 = ROTL_1(Wt+13 ^ Wt+8 ^ Wt+2 ^ Wt)
+ vpternlogd %%WT, %%WTp2, %%WTp8, 0x96
+ vpxord %%WT, %%WT, %%WTp13
+ vprold %%WT, %%WT, 1
+%endmacro
+
+%define APPEND(a,b) a %+ b
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ ; remove unwind info macros
+ %define func(x) x: endbranch
+ %macro FUNC_SAVE 0
+ sub rsp, stack_size
+ movdqa [rsp + 0*16], xmm6
+ movdqa [rsp + 1*16], xmm7
+ movdqa [rsp + 2*16], xmm8
+ movdqa [rsp + 3*16], xmm9
+ movdqa [rsp + 4*16], xmm10
+ movdqa [rsp + 5*16], xmm11
+ movdqa [rsp + 6*16], xmm12
+ movdqa [rsp + 7*16], xmm13
+ movdqa [rsp + 8*16], xmm14
+ movdqa [rsp + 9*16], xmm15
+ mov [rsp + 10*16 + 0*8], r12
+ mov [rsp + 10*16 + 1*8], r13
+ mov [rsp + 10*16 + 2*8], r14
+ mov [rsp + 10*16 + 3*8], r15
+ mov [rsp + 10*16 + 4*8], rdi
+ mov [rsp + 10*16 + 5*8], rsi
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp2
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h4 | h4 | h4 | ...| h4 | | Ea| Eb | Ec |...| Ep |
+
+[bits 64]
+section .text
+align 32
+
+;void mh_sha1_block_avx512(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including segment digests (uint32_t digests[16][5])
+; arg 2 pointer to aligned_frame_buffer, used to store the big-endian data
+; arg 3 number of 1KB blocks
+;
+global mh_sha1_block_avx512
+func(mh_sha1_block_avx512)
+ endbranch
+ FUNC_SAVE
+
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; align rsp to 64 Bytes needed by avx512
+ and rsp, ~0x3f
+
+ ; copy segs_digests into registers.
+ VMOVPS HH0, [mh_digests_p + 64*0]
+ VMOVPS HH1, [mh_digests_p + 64*1]
+ VMOVPS HH2, [mh_digests_p + 64*2]
+ VMOVPS HH3, [mh_digests_p + 64*3]
+ VMOVPS HH4, [mh_digests_p + 64*4]
+ ;a mask used to transform to big-endian data
+ vmovdqa64 SHUF_MASK, [PSHUFFLE_BYTE_FLIP_MASK]
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ ;using extra 16 ZMM registers instead of stack
+%assign I 0
+%rep 8
+%assign J (I+1)
+ VMOVPS APPEND(W,I),[mh_in_p + I*64+0*64]
+ VMOVPS APPEND(W,J),[mh_in_p + I*64+1*64]
+
+ vpshufb APPEND(W,I), APPEND(W,I), SHUF_MASK
+ vpshufb APPEND(W,J), APPEND(W,J), SHUF_MASK
+%assign I (I+2)
+%endrep
+
+ vmovdqa64 A, HH0
+ vmovdqa64 B, HH1
+ vmovdqa64 C, HH2
+ vmovdqa64 D, HH3
+ vmovdqa64 E, HH4
+
+ vmovdqa32 KT, [K00_19]
+%assign I 0xCA
+%assign J 0
+%assign K 2
+%assign L 8
+%assign M 13
+%assign N 0
+%rep 80
+ PROCESS_LOOP APPEND(W,J), I
+ %if N < 64
+ MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+ %endif
+ %if N = 19
+ vmovdqa32 KT, [K20_39]
+ %assign I 0x96
+ %elif N = 39
+ vmovdqa32 KT, [K40_59]
+ %assign I 0xE8
+ %elif N = 59
+ vmovdqa32 KT, [K60_79]
+ %assign I 0x96
+ %endif
+ %if N % 10 = 9
+ PREFETCH_X [mh_in_p + 1024+128*(N / 10)]
+ %endif
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%assign N (N+1)
+%endrep
+
+ ; Add old digest
+ vpaddd HH0,A, HH0
+ vpaddd HH1,B, HH1
+ vpaddd HH2,C, HH2
+ vpaddd HH3,D, HH3
+ vpaddd HH4,E, HH4
+
+ add mh_in_p, 1024
+ sub loops, 1
+ jne .block_loop
+
+ ; copy segs_digests to mh_digests_p
+ VMOVPS [mh_digests_p + 64*0], HH0
+ VMOVPS [mh_digests_p + 64*1], HH1
+ VMOVPS [mh_digests_p + 64*2], HH2
+ VMOVPS [mh_digests_p + 64*3], HH3
+ VMOVPS [mh_digests_p + 64*4], HH4
+
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+
+section .data align=64
+
+align 64
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+
+K00_19: dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+
+K20_39: dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+
+K40_59: dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+
+K60_79: dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_mh_sha1_block_avx512
+no_mh_sha1_block_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
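
The AVX-512 path folds each MAGIC_F*-style sequence into a single vpternlogd: the 8-bit immediate is a truth table indexed by the three input bits (the B copy in the destination, then C, then D). The stand-alone check below confirms that the immediates PROCESS_LOOP uses, 0xCA, 0x96 and 0xE8, are exactly Ch, Parity and Maj (verification code only, not part of the library):

#include <stdint.h>
#include <stdio.h>

/* Bit ((b<<2)|(c<<1)|d) of the immediate is the result for inputs (b,c,d). */
static int ternlog(uint8_t imm, int b, int c, int d)
{
	return (imm >> ((b << 2) | (c << 1) | d)) & 1;
}

int main(void)
{
	for (int b = 0; b <= 1; b++)
		for (int c = 0; c <= 1; c++)
			for (int d = 0; d <= 1; d++) {
				int ch     = (b & c) | (!b & d);		/* rounds 0-19   */
				int parity = b ^ c ^ d;				/* 20-39, 60-79  */
				int maj    = (b & c) | (b & d) | (c & d);	/* 40-59         */
				if (ternlog(0xCA, b, c, d) != ch ||
				    ternlog(0x96, b, c, d) != parity ||
				    ternlog(0xE8, b, c, d) != maj) {
					printf("mismatch\n");
					return 1;
				}
			}
	printf("0xCA=Ch, 0x96=Parity, 0xE8=Maj: ok\n");
	return 0;
}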
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_base.c
new file mode 100644
index 000000000..402c9741a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_base.c
@@ -0,0 +1,387 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "mh_sha1_internal.h"
+#include <string.h>
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Base multi-hash SHA1 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+#define store_w(s, i, w, ww) (w[i][s] = to_be32(ww[i*HASH_SEGS+s])) // only used for step 0 ~ 15
+#define update_w(s, i, w) (w[i&15][s] = rol32(w[(i-3)&15][s]^w[(i-8)&15][s]^w[(i-14)&15][s]^w[(i-16)&15][s], 1)) // used for step > 15
+#define update_e_1(s, a, b, c, d, e, i, w) (e[s] += rol32(a[s],5) + F1(b[s],c[s],d[s]) + K_00_19 + w[i&15][s])
+#define update_e_2(s, a, b, c, d, e, i, w) (e[s] += rol32(a[s],5) + F2(b[s],c[s],d[s]) + K_20_39 + w[i&15][s])
+#define update_e_3(s, a, b, c, d, e, i, w) (e[s] += rol32(a[s],5) + F3(b[s],c[s],d[s]) + K_40_59 + w[i&15][s])
+#define update_e_4(s, a, b, c, d, e, i, w) (e[s] += rol32(a[s],5) + F4(b[s],c[s],d[s]) + K_60_79 + w[i&15][s])
+#define update_b(s, b) (b[s] = rol32(b[s],30))
+
+#define STORE_W(i, w, ww) \
+ store_w(0, i, w, ww); \
+ store_w(1, i, w, ww); \
+ store_w(2, i, w, ww); \
+ store_w(3, i, w, ww); \
+ store_w(4, i, w, ww); \
+ store_w(5, i, w, ww); \
+ store_w(6, i, w, ww); \
+ store_w(7, i, w, ww); \
+ store_w(8, i, w, ww); \
+ store_w(9, i, w, ww); \
+ store_w(10, i, w, ww); \
+ store_w(11, i, w, ww); \
+ store_w(12, i, w, ww); \
+ store_w(13, i, w, ww); \
+ store_w(14, i, w, ww); \
+ store_w(15, i, w, ww)
+
+#define UPDATE_W(i, w) \
+ update_w(0, i, w); \
+ update_w(1, i, w); \
+ update_w(2, i, w); \
+ update_w(3, i, w); \
+ update_w(4, i, w); \
+ update_w(5, i, w); \
+ update_w(6, i, w); \
+ update_w(7, i, w); \
+ update_w(8, i, w); \
+ update_w(9, i, w); \
+ update_w(10, i, w); \
+ update_w(11, i, w); \
+ update_w(12, i, w); \
+ update_w(13, i, w); \
+ update_w(14, i, w); \
+ update_w(15, i, w)
+
+#define UPDATE_E1(a, b, c, d, e, i, w) \
+ update_e_1(0, a, b, c, d, e, i, w); \
+ update_e_1(1, a, b, c, d, e, i, w); \
+ update_e_1(2, a, b, c, d, e, i, w); \
+ update_e_1(3, a, b, c, d, e, i, w); \
+ update_e_1(4, a, b, c, d, e, i, w); \
+ update_e_1(5, a, b, c, d, e, i, w); \
+ update_e_1(6, a, b, c, d, e, i, w); \
+ update_e_1(7, a, b, c, d, e, i, w); \
+ update_e_1(8, a, b, c, d, e, i, w); \
+ update_e_1(9, a, b, c, d, e, i, w); \
+ update_e_1(10, a, b, c, d, e, i, w); \
+ update_e_1(11, a, b, c, d, e, i, w); \
+ update_e_1(12, a, b, c, d, e, i, w); \
+ update_e_1(13, a, b, c, d, e, i, w); \
+ update_e_1(14, a, b, c, d, e, i, w); \
+ update_e_1(15, a, b, c, d, e, i, w)
+
+#define UPDATE_E2(a, b, c, d, e, i, w) \
+ update_e_2(0, a, b, c, d, e, i, w); \
+ update_e_2(1, a, b, c, d, e, i, w); \
+ update_e_2(2, a, b, c, d, e, i, w); \
+ update_e_2(3, a, b, c, d, e, i, w); \
+ update_e_2(4, a, b, c, d, e, i, w); \
+ update_e_2(5, a, b, c, d, e, i, w); \
+ update_e_2(6, a, b, c, d, e, i, w); \
+ update_e_2(7, a, b, c, d, e, i, w); \
+ update_e_2(8, a, b, c, d, e, i, w); \
+ update_e_2(9, a, b, c, d, e, i, w); \
+ update_e_2(10, a, b, c, d, e, i, w); \
+ update_e_2(11, a, b, c, d, e, i, w); \
+ update_e_2(12, a, b, c, d, e, i, w); \
+ update_e_2(13, a, b, c, d, e, i, w); \
+ update_e_2(14, a, b, c, d, e, i, w); \
+ update_e_2(15, a, b, c, d, e, i, w)
+
+#define UPDATE_E3(a, b, c, d, e, i, w) \
+ update_e_3(0, a, b, c, d, e, i, w); \
+ update_e_3(1, a, b, c, d, e, i, w); \
+ update_e_3(2, a, b, c, d, e, i, w); \
+ update_e_3(3, a, b, c, d, e, i, w); \
+ update_e_3(4, a, b, c, d, e, i, w); \
+ update_e_3(5, a, b, c, d, e, i, w); \
+ update_e_3(6, a, b, c, d, e, i, w); \
+ update_e_3(7, a, b, c, d, e, i, w); \
+ update_e_3(8, a, b, c, d, e, i, w); \
+ update_e_3(9, a, b, c, d, e, i, w); \
+ update_e_3(10, a, b, c, d, e, i, w); \
+ update_e_3(11, a, b, c, d, e, i, w); \
+ update_e_3(12, a, b, c, d, e, i, w); \
+ update_e_3(13, a, b, c, d, e, i, w); \
+ update_e_3(14, a, b, c, d, e, i, w); \
+ update_e_3(15, a, b, c, d, e, i, w)
+
+#define UPDATE_E4(a, b, c, d, e, i, w) \
+ update_e_4(0, a, b, c, d, e, i, w); \
+ update_e_4(1, a, b, c, d, e, i, w); \
+ update_e_4(2, a, b, c, d, e, i, w); \
+ update_e_4(3, a, b, c, d, e, i, w); \
+ update_e_4(4, a, b, c, d, e, i, w); \
+ update_e_4(5, a, b, c, d, e, i, w); \
+ update_e_4(6, a, b, c, d, e, i, w); \
+ update_e_4(7, a, b, c, d, e, i, w); \
+ update_e_4(8, a, b, c, d, e, i, w); \
+ update_e_4(9, a, b, c, d, e, i, w); \
+ update_e_4(10, a, b, c, d, e, i, w); \
+ update_e_4(11, a, b, c, d, e, i, w); \
+ update_e_4(12, a, b, c, d, e, i, w); \
+ update_e_4(13, a, b, c, d, e, i, w); \
+ update_e_4(14, a, b, c, d, e, i, w); \
+ update_e_4(15, a, b, c, d, e, i, w)
+
+#define UPDATE_B(b) \
+ update_b(0, b); \
+ update_b(1, b); \
+ update_b(2, b); \
+ update_b(3, b); \
+ update_b(4, b); \
+ update_b(5, b); \
+ update_b(6, b); \
+ update_b(7, b); \
+ update_b(8, b); \
+ update_b(9, b); \
+ update_b(10, b); \
+ update_b(11, b); \
+ update_b(12, b); \
+ update_b(13, b); \
+ update_b(14, b); \
+ update_b(15, b)
+
+static inline void step00_15(int i, uint32_t * a, uint32_t * b, uint32_t * c,
+ uint32_t * d, uint32_t * e, uint32_t(*w)[HASH_SEGS],
+ uint32_t * ww)
+{
+ STORE_W(i, w, ww);
+ UPDATE_E1(a, b, c, d, e, i, w);
+ UPDATE_B(b);
+}
+
+static inline void step16_19(int i, uint32_t * a, uint32_t * b, uint32_t * c,
+ uint32_t * d, uint32_t * e, uint32_t(*w)[HASH_SEGS])
+{
+ UPDATE_W(i, w);
+ UPDATE_E1(a, b, c, d, e, i, w);
+ UPDATE_B(b);
+
+}
+
+static inline void step20_39(int i, uint32_t * a, uint32_t * b, uint32_t * c,
+ uint32_t * d, uint32_t * e, uint32_t(*w)[HASH_SEGS])
+{
+ UPDATE_W(i, w);
+ UPDATE_E2(a, b, c, d, e, i, w);
+ UPDATE_B(b);
+}
+
+static inline void step40_59(int i, uint32_t * a, uint32_t * b, uint32_t * c,
+ uint32_t * d, uint32_t * e, uint32_t(*w)[HASH_SEGS])
+{
+ UPDATE_W(i, w);
+ UPDATE_E3(a, b, c, d, e, i, w);
+ UPDATE_B(b);
+}
+
+static inline void step60_79(int i, uint32_t * a, uint32_t * b, uint32_t * c,
+ uint32_t * d, uint32_t * e, uint32_t(*w)[HASH_SEGS])
+{
+ UPDATE_W(i, w);
+ UPDATE_E4(a, b, c, d, e, i, w);
+ UPDATE_B(b);
+}
+
+static inline void init_abcde(uint32_t * xx, uint32_t n,
+ uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS])
+{
+ xx[0] = digests[n][0];
+ xx[1] = digests[n][1];
+ xx[2] = digests[n][2];
+ xx[3] = digests[n][3];
+ xx[4] = digests[n][4];
+ xx[5] = digests[n][5];
+ xx[6] = digests[n][6];
+ xx[7] = digests[n][7];
+ xx[8] = digests[n][8];
+ xx[9] = digests[n][9];
+ xx[10] = digests[n][10];
+ xx[11] = digests[n][11];
+ xx[12] = digests[n][12];
+ xx[13] = digests[n][13];
+ xx[14] = digests[n][14];
+ xx[15] = digests[n][15];
+}
+
+static inline void add_abcde(uint32_t * xx, uint32_t n,
+ uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS])
+{
+ digests[n][0] += xx[0];
+ digests[n][1] += xx[1];
+ digests[n][2] += xx[2];
+ digests[n][3] += xx[3];
+ digests[n][4] += xx[4];
+ digests[n][5] += xx[5];
+ digests[n][6] += xx[6];
+ digests[n][7] += xx[7];
+ digests[n][8] += xx[8];
+ digests[n][9] += xx[9];
+ digests[n][10] += xx[10];
+ digests[n][11] += xx[11];
+ digests[n][12] += xx[12];
+ digests[n][13] += xx[13];
+ digests[n][14] += xx[14];
+ digests[n][15] += xx[15];
+}
+
+/*
+ * API to perform 0-79 steps of the multi-hash algorithm for
+ * a single block of data. The caller is responsible for supplying
+ * a full block of input data.
+ *
+ * Arguments:
+ *   input   - pointer to the input data
+ *   digests - the space that holds the digests of all segments
+ *
+ * Return:
+ * N/A
+ */
+void mh_sha1_single(const uint8_t * input, uint32_t(*digests)[HASH_SEGS],
+ uint8_t * frame_buffer)
+{
+ uint32_t aa[HASH_SEGS], bb[HASH_SEGS], cc[HASH_SEGS], dd[HASH_SEGS], ee[HASH_SEGS];
+ uint32_t *ww = (uint32_t *) input;
+ uint32_t(*w)[HASH_SEGS];
+
+ w = (uint32_t(*)[HASH_SEGS]) frame_buffer;
+
+ init_abcde(aa, 0, digests);
+ init_abcde(bb, 1, digests);
+ init_abcde(cc, 2, digests);
+ init_abcde(dd, 3, digests);
+ init_abcde(ee, 4, digests);
+
+ step00_15(0, aa, bb, cc, dd, ee, w, ww);
+ step00_15(1, ee, aa, bb, cc, dd, w, ww);
+ step00_15(2, dd, ee, aa, bb, cc, w, ww);
+ step00_15(3, cc, dd, ee, aa, bb, w, ww);
+ step00_15(4, bb, cc, dd, ee, aa, w, ww);
+ step00_15(5, aa, bb, cc, dd, ee, w, ww);
+ step00_15(6, ee, aa, bb, cc, dd, w, ww);
+ step00_15(7, dd, ee, aa, bb, cc, w, ww);
+ step00_15(8, cc, dd, ee, aa, bb, w, ww);
+ step00_15(9, bb, cc, dd, ee, aa, w, ww);
+ step00_15(10, aa, bb, cc, dd, ee, w, ww);
+ step00_15(11, ee, aa, bb, cc, dd, w, ww);
+ step00_15(12, dd, ee, aa, bb, cc, w, ww);
+ step00_15(13, cc, dd, ee, aa, bb, w, ww);
+ step00_15(14, bb, cc, dd, ee, aa, w, ww);
+ step00_15(15, aa, bb, cc, dd, ee, w, ww);
+
+ step16_19(16, ee, aa, bb, cc, dd, w);
+ step16_19(17, dd, ee, aa, bb, cc, w);
+ step16_19(18, cc, dd, ee, aa, bb, w);
+ step16_19(19, bb, cc, dd, ee, aa, w);
+
+ step20_39(20, aa, bb, cc, dd, ee, w);
+ step20_39(21, ee, aa, bb, cc, dd, w);
+ step20_39(22, dd, ee, aa, bb, cc, w);
+ step20_39(23, cc, dd, ee, aa, bb, w);
+ step20_39(24, bb, cc, dd, ee, aa, w);
+ step20_39(25, aa, bb, cc, dd, ee, w);
+ step20_39(26, ee, aa, bb, cc, dd, w);
+ step20_39(27, dd, ee, aa, bb, cc, w);
+ step20_39(28, cc, dd, ee, aa, bb, w);
+ step20_39(29, bb, cc, dd, ee, aa, w);
+ step20_39(30, aa, bb, cc, dd, ee, w);
+ step20_39(31, ee, aa, bb, cc, dd, w);
+ step20_39(32, dd, ee, aa, bb, cc, w);
+ step20_39(33, cc, dd, ee, aa, bb, w);
+ step20_39(34, bb, cc, dd, ee, aa, w);
+ step20_39(35, aa, bb, cc, dd, ee, w);
+ step20_39(36, ee, aa, bb, cc, dd, w);
+ step20_39(37, dd, ee, aa, bb, cc, w);
+ step20_39(38, cc, dd, ee, aa, bb, w);
+ step20_39(39, bb, cc, dd, ee, aa, w);
+
+ step40_59(40, aa, bb, cc, dd, ee, w);
+ step40_59(41, ee, aa, bb, cc, dd, w);
+ step40_59(42, dd, ee, aa, bb, cc, w);
+ step40_59(43, cc, dd, ee, aa, bb, w);
+ step40_59(44, bb, cc, dd, ee, aa, w);
+ step40_59(45, aa, bb, cc, dd, ee, w);
+ step40_59(46, ee, aa, bb, cc, dd, w);
+ step40_59(47, dd, ee, aa, bb, cc, w);
+ step40_59(48, cc, dd, ee, aa, bb, w);
+ step40_59(49, bb, cc, dd, ee, aa, w);
+ step40_59(50, aa, bb, cc, dd, ee, w);
+ step40_59(51, ee, aa, bb, cc, dd, w);
+ step40_59(52, dd, ee, aa, bb, cc, w);
+ step40_59(53, cc, dd, ee, aa, bb, w);
+ step40_59(54, bb, cc, dd, ee, aa, w);
+ step40_59(55, aa, bb, cc, dd, ee, w);
+ step40_59(56, ee, aa, bb, cc, dd, w);
+ step40_59(57, dd, ee, aa, bb, cc, w);
+ step40_59(58, cc, dd, ee, aa, bb, w);
+ step40_59(59, bb, cc, dd, ee, aa, w);
+
+ step60_79(60, aa, bb, cc, dd, ee, w);
+ step60_79(61, ee, aa, bb, cc, dd, w);
+ step60_79(62, dd, ee, aa, bb, cc, w);
+ step60_79(63, cc, dd, ee, aa, bb, w);
+ step60_79(64, bb, cc, dd, ee, aa, w);
+ step60_79(65, aa, bb, cc, dd, ee, w);
+ step60_79(66, ee, aa, bb, cc, dd, w);
+ step60_79(67, dd, ee, aa, bb, cc, w);
+ step60_79(68, cc, dd, ee, aa, bb, w);
+ step60_79(69, bb, cc, dd, ee, aa, w);
+ step60_79(70, aa, bb, cc, dd, ee, w);
+ step60_79(71, ee, aa, bb, cc, dd, w);
+ step60_79(72, dd, ee, aa, bb, cc, w);
+ step60_79(73, cc, dd, ee, aa, bb, w);
+ step60_79(74, bb, cc, dd, ee, aa, w);
+ step60_79(75, aa, bb, cc, dd, ee, w);
+ step60_79(76, ee, aa, bb, cc, dd, w);
+ step60_79(77, dd, ee, aa, bb, cc, w);
+ step60_79(78, cc, dd, ee, aa, bb, w);
+ step60_79(79, bb, cc, dd, ee, aa, w);
+
+ add_abcde(aa, 0, digests);
+ add_abcde(bb, 1, digests);
+ add_abcde(cc, 2, digests);
+ add_abcde(dd, 3, digests);
+ add_abcde(ee, 4, digests);
+}
+
+void mh_sha1_block_base(const uint8_t * input_data,
+ uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks)
+{
+ uint32_t i;
+
+ for (i = 0; i < num_blocks; i++) {
+ mh_sha1_single(input_data, digests, frame_buffer);
+ input_data += MH_SHA1_BLOCK_SIZE;
+ }
+
+ return;
+}
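
The block functions above run sixteen independent SHA1 states in lockstep, one array
index per segment. The following C sketch shows the lane-parallel round step that the
store_w/update_e_1/update_b macros expand to; it is illustrative only, not part of the
patch, assumes HASH_SEGS == 16, and uses sketch_-prefixed names instead of isa-l symbols.

    #include <stdint.h>

    #define SKETCH_HASH_SEGS 16
    #define SKETCH_K_00_19   0x5a827999UL

    static inline uint32_t sketch_rol32(uint32_t x, int r)
    {
            return (x << r) | (x >> (32 - r));
    }

    /* F1 for rounds 0..19: d ^ (b & (c ^ d)) */
    static inline uint32_t sketch_f1(uint32_t b, uint32_t c, uint32_t d)
    {
            return d ^ (b & (c ^ d));
    }

    /* One round-0..19 step applied to all 16 segments: each index s is an
     * independent SHA1 lane, which is what update_e_1() and update_b() do
     * one lane at a time in the macros above. */
    static void sketch_step_all_lanes(uint32_t a[SKETCH_HASH_SEGS],
                                      uint32_t b[SKETCH_HASH_SEGS],
                                      uint32_t c[SKETCH_HASH_SEGS],
                                      uint32_t d[SKETCH_HASH_SEGS],
                                      uint32_t e[SKETCH_HASH_SEGS],
                                      const uint32_t w[SKETCH_HASH_SEGS])
    {
            for (int s = 0; s < SKETCH_HASH_SEGS; s++) {
                    e[s] += sketch_rol32(a[s], 5) + sketch_f1(b[s], c[s], d[s])
                            + SKETCH_K_00_19 + w[s];
                    b[s] = sketch_rol32(b[s], 30);
            }
    }
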
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_sse.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_sse.asm
new file mode 100644
index 000000000..3d75d1649
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_sse.asm
@@ -0,0 +1,498 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute the 16-segment multi-hash SHA1 using SSE
+;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regC
+ pxor %%regF,%%regD
+ pand %%regF,%%regB
+ pxor %%regF,%%regD
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regD
+ pxor %%regF,%%regC
+ pxor %%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regB
+ movdqa %%regT,%%regB
+ por %%regF,%%regC
+ pand %%regT,%%regC
+ pand %%regF,%%regD
+ por %%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ pslld %%reg, %%imm
+ psrld %%tmp, (32-%%imm)
+ por %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ paddd %%regE,%%immCNT
+ paddd %%regE,[%%data + (%%memW * 16)]
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ paddd %%regE,%%immCNT
+ movdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ pxor W16, W14
+ pxor W16, [%%data + ((%%memW - 8) & 15) * 16]
+ pxor W16, [%%data + ((%%memW - 3) & 15) * 16]
+ movdqa %%regF, W16
+ pslld W16, 1
+ psrld %%regF, (32-1)
+ por %%regF, W16
+ ROTATE_W
+
+ movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ paddd %%regE,%%regF
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp2
+%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define MOVPS movups
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4
+%define F xmm5 ; tmp
+%define G xmm6 ; tmp
+
+%define TMP G
+%define FUN F
+%define K xmm7
+
+%define AA xmm8
+%define BB xmm9
+%define CC xmm10
+%define DD xmm11
+%define EE xmm12
+
+%define T0 xmm6
+%define T1 xmm7
+%define T2 xmm8
+%define T3 xmm9
+%define T4 xmm10
+%define T5 xmm11
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define W14 xmm13
+%define W15 xmm14
+%define W16 xmm15
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h4 | h4 | h4 | ...| h4 | | Ea| Eb | Ec |...| Ep |
+
+align 32
+
+;void mh_sha1_block_sse(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including the segment digests (uint32_t digests[16][5])
+; arg 2 pointer to aligned_frame_buffer, which is used to hold the big-endian data.
+; arg 3 number of 1KB blocks
+;
+mk_global mh_sha1_block_sse, function, internal
+func(mh_sha1_block_sse)
+ endbranch
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 16 Bytes needed by sse
+ and rsp, ~0x0F
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 5
+ MOVPS A, [mh_digests_p + I*64 + 16*0]
+ MOVPS B, [mh_digests_p + I*64 + 16*1]
+ MOVPS C, [mh_digests_p + I*64 + 16*2]
+ MOVPS D, [mh_digests_p + I*64 + 16*3]
+
+ movdqa [rsp + I*64 + 16*0], A
+ movdqa [rsp + I*64 + 16*1], B
+ movdqa [rsp + I*64 + 16*2], C
+ movdqa [rsp + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ movdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*5 to DWORD*4_SEGS*5*4
+ %assign I 0
+ %rep 16
+ MOVPS T0,[mh_in_p + I*64+0*16]
+ MOVPS T1,[mh_in_p + I*64+1*16]
+ MOVPS T2,[mh_in_p + I*64+2*16]
+ MOVPS T3,[mh_in_p + I*64+3*16]
+
+ pshufb T0, F
+ movdqa [mh_data_p +(I)*16 +0*256],T0
+ pshufb T1, F
+ movdqa [mh_data_p +(I)*16 +1*256],T1
+ pshufb T2, F
+ movdqa [mh_data_p +(I)*16 +2*256],T2
+ pshufb T3, F
+ movdqa [mh_data_p +(I)*16 +3*256],T3
+ %assign I (I+1)
+ %endrep
+
+ mov mh_segs, 0 ;start from the first 4 segments
+	mov	pref, 1024		;avoid prefetching repeatedly
+ .segs_loop:
+ ;; Initialize digests
+ movdqa A, [rsp + 0*64 + mh_segs]
+ movdqa B, [rsp + 1*64 + mh_segs]
+ movdqa C, [rsp + 2*64 + mh_segs]
+ movdqa D, [rsp + 3*64 + mh_segs]
+ movdqa E, [rsp + 4*64 + mh_segs]
+
+ movdqa AA, A
+ movdqa BB, B
+ movdqa CC, C
+ movdqa DD, D
+ movdqa EE, E
+;;
+;; perform 0-79 steps
+;;
+ movdqa K, [K00_19]
+;; do rounds 0...15
+ %assign I 0
+ %rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 16...19
+ movdqa W16, [mh_data_p + ((16 - 16) & 15) * 16]
+ movdqa W15, [mh_data_p + ((16 - 15) & 15) * 16]
+ %rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*0]
+;; do rounds 20...39
+ movdqa K, [K20_39]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 40...59
+ movdqa K, [K40_59]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*1]
+;; do rounds 60...79
+ movdqa K, [K60_79]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+ paddd A, AA
+ paddd B, BB
+ paddd C, CC
+ paddd D, DD
+ paddd E, EE
+
+ ; write out digests
+ movdqa [rsp + 0*64 + mh_segs], A
+ movdqa [rsp + 1*64 + mh_segs], B
+ movdqa [rsp + 2*64 + mh_segs], C
+ movdqa [rsp + 3*64 + mh_segs], D
+ movdqa [rsp + 4*64 + mh_segs], E
+
+ add pref, 256
+ add mh_data_p, 256
+ add mh_segs, 16
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 5
+ movdqa A, [rsp + I*64 + 16*0]
+ movdqa B, [rsp + I*64 + 16*1]
+ movdqa C, [rsp + I*64 + 16*2]
+ movdqa D, [rsp + I*64 + 16*3]
+
+ MOVPS [mh_digests_p + I*64 + 16*0], A
+ MOVPS [mh_digests_p + I*64 + 16*1], B
+ MOVPS [mh_digests_p + I*64 + 16*2], C
+ MOVPS [mh_digests_p + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=16
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
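
Two of the building blocks above have simple scalar equivalents, shown here as a C
reference sketch (not part of the patch): PROLD is a per-lane 32-bit rotate-left, and
the pshufb with PSHUFFLE_BYTE_FLIP_MASK reverses the bytes of every 32-bit word so the
little-endian input becomes the big-endian words SHA1 expects.

    #include <stdint.h>

    /* PROLD reg, imm, tmp == rotate each 32-bit lane left by imm bits */
    static inline uint32_t sketch_prold(uint32_t x, int imm)
    {
            return (x << imm) | (x >> (32 - imm));
    }

    /* pshufb with 0x0405060700010203, 0x0c0d0e0f08090a0b reverses the bytes
     * of every dword, i.e. a 32-bit byte swap */
    static inline uint32_t sketch_byte_flip(uint32_t x)
    {
            return (x >> 24) | ((x >> 8) & 0x0000ff00u) |
                   ((x << 8) & 0x00ff0000u) | (x << 24);
    }
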
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_finalize_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_finalize_base.c
new file mode 100644
index 000000000..3058aaa87
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_finalize_base.c
@@ -0,0 +1,122 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/*
+ * mh_sha1_finalize_base.c contains the definitions of mh_sha1_finalize_XXX
+ * and mh_sha1_tail_XXX. The default definitions are the base type, which
+ * generates mh_sha1_finalize_base and mh_sha1_tail_base. Other types are
+ * generated by mh_sha1.c through different predefined macros.
+ * mh_sha1_tail calculates the last, incomplete block of the input data.
+ * mh_sha1_finalize is the mh_sha1_ctx wrapper around mh_sha1_tail.
+ */
+#ifndef MH_SHA1_FINALIZE_FUNCTION
+#include <string.h>
+#include "mh_sha1_internal.h"
+
+#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_base
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_base
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_base
+#define MH_SHA1_FINALIZE_SLVER
+#endif
+
+void MH_SHA1_TAIL_FUNCTION(uint8_t * partial_buffer, uint32_t total_len,
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS], uint8_t * frame_buffer,
+ uint32_t digests[SHA1_DIGEST_WORDS])
+{
+ uint64_t partial_buffer_len, len_in_bit;
+
+ partial_buffer_len = total_len % MH_SHA1_BLOCK_SIZE;
+
+ // Padding the first block
+ partial_buffer[partial_buffer_len] = 0x80;
+ partial_buffer_len++;
+ memset(partial_buffer + partial_buffer_len, 0,
+ MH_SHA1_BLOCK_SIZE - partial_buffer_len);
+
+	// Calculate the first block without total_length if the padding needs 2 blocks
+ if (partial_buffer_len > (MH_SHA1_BLOCK_SIZE - 8)) {
+ MH_SHA1_BLOCK_FUNCTION(partial_buffer, mh_sha1_segs_digests, frame_buffer, 1);
+ //Padding the second block
+ memset(partial_buffer, 0, MH_SHA1_BLOCK_SIZE);
+ }
+	// Finish the padding with the total length in bits
+ len_in_bit = to_be64((uint64_t) total_len * 8);
+ *(uint64_t *) (partial_buffer + MH_SHA1_BLOCK_SIZE - 8) = len_in_bit;
+ MH_SHA1_BLOCK_FUNCTION(partial_buffer, mh_sha1_segs_digests, frame_buffer, 1);
+
+ //Calculate multi-hash SHA1 digests (segment digests as input message)
+ sha1_for_mh_sha1((uint8_t *) mh_sha1_segs_digests, digests,
+ 4 * SHA1_DIGEST_WORDS * HASH_SEGS);
+
+ return;
+}
+
+int MH_SHA1_FINALIZE_FUNCTION(struct mh_sha1_ctx *ctx, void *mh_sha1_digest)
+{
+ uint8_t *partial_block_buffer;
+ uint64_t total_len;
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS];
+ uint8_t *aligned_frame_buffer;
+
+ if (ctx == NULL)
+ return MH_SHA1_CTX_ERROR_NULL;
+
+ total_len = ctx->total_length;
+ partial_block_buffer = ctx->partial_block_buffer;
+
+ /* mh_sha1 tail */
+ aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer);
+ mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests;
+
+ MH_SHA1_TAIL_FUNCTION(partial_block_buffer, total_len, mh_sha1_segs_digests,
+ aligned_frame_buffer, ctx->mh_sha1_digest);
+
+ /* Output the digests of mh_sha1 */
+ if (mh_sha1_digest != NULL) {
+ ((uint32_t *) mh_sha1_digest)[0] = ctx->mh_sha1_digest[0];
+ ((uint32_t *) mh_sha1_digest)[1] = ctx->mh_sha1_digest[1];
+ ((uint32_t *) mh_sha1_digest)[2] = ctx->mh_sha1_digest[2];
+ ((uint32_t *) mh_sha1_digest)[3] = ctx->mh_sha1_digest[3];
+ ((uint32_t *) mh_sha1_digest)[4] = ctx->mh_sha1_digest[4];
+ }
+
+ return MH_SHA1_CTX_ERROR_NONE;
+}
+
+#ifdef MH_SHA1_FINALIZE_SLVER
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// Version info
+struct slver mh_sha1_finalize_base_slver_0000027b;
+struct slver mh_sha1_finalize_base_slver = { 0x027b, 0x00, 0x00 };
+#endif
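
The tail routine above emits either one or two padded blocks, depending on whether the
leftover bytes plus the 0x80 marker still leave room for the 8-byte bit-length field.
A small C sketch of that decision, assuming the 1024-byte MH_SHA1_BLOCK_SIZE used by
the block functions in this patch (sketch_ names are illustrative, not isa-l symbols):

    #include <stdint.h>

    #define SKETCH_BLOCK_SIZE 1024

    /* Mirrors the partial_buffer_len > (MH_SHA1_BLOCK_SIZE - 8) test above. */
    static uint32_t sketch_padded_blocks(uint32_t total_len)
    {
            uint32_t rem = total_len % SKETCH_BLOCK_SIZE;

            /* rem data bytes + 1 byte of 0x80 must leave 8 bytes for the length */
            return (rem + 1 > SKETCH_BLOCK_SIZE - 8) ? 2 : 1;
    }
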
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_internal.h b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_internal.h
new file mode 100644
index 000000000..81823048e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_internal.h
@@ -0,0 +1,308 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MH_SHA1_INTERNAL_H_
+#define _MH_SHA1_INTERNAL_H_
+
+/**
+ * @file mh_sha1_internal.h
+ * @brief mh_sha1 internal function prototypes and macros
+ *
+ * Interface for mh_sha1 internal functions
+ *
+ */
+#include <stdint.h>
+#include "mh_sha1.h"
+#include "endian_helper.h"
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#ifdef _MSC_VER
+# define inline __inline
+#endif
+
+ // 64byte pointer align
+#define ALIGN_64(pointer) ( ((uint64_t)(pointer) + 0x3F)&(~0x3F) )
+
+ /*******************************************************************
+ * mh_sha1 constants and macros
+ ******************************************************************/
+ /* mh_sha1 constants */
+#define MH_SHA1_H0 0x67452301UL
+#define MH_SHA1_H1 0xefcdab89UL
+#define MH_SHA1_H2 0x98badcfeUL
+#define MH_SHA1_H3 0x10325476UL
+#define MH_SHA1_H4 0xc3d2e1f0UL
+
+#define K_00_19 0x5a827999UL
+#define K_20_39 0x6ed9eba1UL
+#define K_40_59 0x8f1bbcdcUL
+#define K_60_79 0xca62c1d6UL
+
+ /* mh_sha1 macros */
+#define F1(b,c,d) (d ^ (b & (c ^ d)))
+#define F2(b,c,d) (b ^ c ^ d)
+#define F3(b,c,d) ((b & c) | (d & (b | c)))
+#define F4(b,c,d) (b ^ c ^ d)
+
+#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r))))
+
+ /*******************************************************************
+ * SHA1 API internal function prototypes
+ ******************************************************************/
+
+ /**
+ * @brief Performs the complete SHA1 algorithm.
+ *
+ * @param input_data Pointer to the buffer containing the input message.
+ * @param digest Pointer to the digest to update.
+ * @param len Length of the buffer.
+ * @returns None
+ */
+ void sha1_for_mh_sha1(const uint8_t * input_data, uint32_t * digest, const uint32_t len);
+
+ /*******************************************************************
+ * mh_sha1 API internal function prototypes
+ * Multiple versions of Update and Finalize functions are supplied which use
+ * multiple versions of block and tail process subfunctions.
+ ******************************************************************/
+
+ /**
+ * @brief Tail process for multi-hash sha1.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA1_BLOCK_SIZE.
+ * It will output the final SHA1 digest based on mh_sha1_segs_digests.
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param mh_sha1_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @returns none
+ *
+ */
+ void mh_sha1_tail(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha1_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha1.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA1_BLOCK_SIZE.
+ * It will output the final SHA1 digest based on mh_sha1_segs_digests.
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param mh_sha1_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha1_digest mh_sha1 digest
+ * @returns none
+ *
+ */
+ void mh_sha1_tail_base(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha1_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha1.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA1_BLOCK_SIZE.
+ * It will output the final SHA1 digest based on mh_sha1_segs_digests.
+ *
+ * @requires SSE
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param mh_sha1_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha1_digest mh_sha1 digest
+ * @returns none
+ *
+ */
+ void mh_sha1_tail_sse(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha1_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha1.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA1_BLOCK_SIZE.
+ * It will output the final SHA1 digest based on mh_sha1_segs_digests.
+ *
+ * @requires AVX
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param mh_sha1_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha1_digest mh_sha1 digest
+ * @returns none
+ *
+ */
+ void mh_sha1_tail_avx(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha1_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha1.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA1_BLOCK_SIZE.
+ * It will output the final SHA1 digest based on mh_sha1_segs_digests.
+ *
+ * @requires AVX2
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param mh_sha1_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha1_digest mh_sha1 digest
+ * @returns none
+ *
+ */
+ void mh_sha1_tail_avx2(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha1_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha1.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA1_BLOCK_SIZE.
+ * It will output the final SHA1 digest based on mh_sha1_segs_digests.
+ *
+ * @requires AVX512
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param mh_sha1_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha1_digest mh_sha1 digest
+ * @returns none
+ *
+ */
+ void mh_sha1_tail_avx512(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha1_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]);
+
+ /**
+ * @brief Calculate the mh_sha1 digest of blocks whose size is MH_SHA1_BLOCK_SIZE*N.
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_block(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha1 digest of blocks whose size is MH_SHA1_BLOCK_SIZE*N.
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_block_base(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha1 digest of blocks whose size is MH_SHA1_BLOCK_SIZE*N.
+ *
+ * @requires SSE
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_block_sse(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha1 digest of blocks whose size is MH_SHA1_BLOCK_SIZE*N.
+ *
+ * @requires AVX
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_block_avx(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha1 digest of blocks whose size is MH_SHA1_BLOCK_SIZE*N.
+ *
+ * @requires AVX2
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_block_avx2(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha1 digest of blocks whose size is MH_SHA1_BLOCK_SIZE*N.
+ *
+ * @requires AVX512
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_block_avx512(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
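
A short usage sketch of the ALIGN_64 macro defined above, which rounds a pointer value
up to the next 64-byte boundary. The over-allocated scratch buffer is only an
illustration; it is not part of the library.

    #include <stdint.h>

    static void sketch_align_example(void)
    {
            uint8_t scratch[1024 + 63];     /* slack so the aligned region still fits */
            uint8_t *aligned;

            /* same expression as ALIGN_64(pointer) in mh_sha1_internal.h */
            aligned = (uint8_t *)(((uint64_t)scratch + 0x3F) & ~(uint64_t)0x3F);

            (void)aligned;                  /* aligned now sits on a 64-byte boundary */
    }
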
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_multibinary.asm
new file mode 100644
index 000000000..590aa6c5f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_multibinary.asm
@@ -0,0 +1,77 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf32
+ [bits 32]
+%else
+ default rel
+ [bits 64]
+
+ extern mh_sha1_update_sse
+ extern mh_sha1_update_avx
+ extern mh_sha1_update_avx2
+ extern mh_sha1_finalize_sse
+ extern mh_sha1_finalize_avx
+ extern mh_sha1_finalize_avx2
+
+ %ifdef HAVE_AS_KNOWS_AVX512
+ extern mh_sha1_update_avx512
+ extern mh_sha1_finalize_avx512
+ %endif
+
+%endif
+
+extern mh_sha1_update_base
+extern mh_sha1_finalize_base
+
+mbin_interface mh_sha1_update
+mbin_interface mh_sha1_finalize
+
+%ifidn __OUTPUT_FORMAT__, elf64
+
+ %ifdef HAVE_AS_KNOWS_AVX512
+ mbin_dispatch_init6 mh_sha1_update, mh_sha1_update_base, mh_sha1_update_sse, mh_sha1_update_avx, mh_sha1_update_avx2, mh_sha1_update_avx512
+ mbin_dispatch_init6 mh_sha1_finalize, mh_sha1_finalize_base, mh_sha1_finalize_sse, mh_sha1_finalize_avx, mh_sha1_finalize_avx2, mh_sha1_finalize_avx512
+ %else
+ mbin_dispatch_init5 mh_sha1_update, mh_sha1_update_base, mh_sha1_update_sse, mh_sha1_update_avx, mh_sha1_update_avx2
+ mbin_dispatch_init5 mh_sha1_finalize, mh_sha1_finalize_base, mh_sha1_finalize_sse, mh_sha1_finalize_avx, mh_sha1_finalize_avx2
+ %endif
+
+%else
+ mbin_dispatch_init2 mh_sha1_update, mh_sha1_update_base
+ mbin_dispatch_init2 mh_sha1_finalize, mh_sha1_finalize_base
+%endif
+
+;;; func core, ver, snum
+slversion mh_sha1_update, 00, 02, 0272
+slversion mh_sha1_finalize, 00, 02, 0273
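
Conceptually, the mbin_interface/mbin_dispatch_init macros above resolve mh_sha1_update
and mh_sha1_finalize to the best variant available on the running CPU. The C sketch
below shows that idea with an explicit function-pointer selection; it is not the actual
mbin_* implementation, cpu_supports_sse()/cpu_supports_avx2() are hypothetical
feature-detection helpers, and the prototypes simply mirror how mh_sha1_perf.c calls
these functions.

    #include <stdint.h>

    struct mh_sha1_ctx;                     /* defined in mh_sha1.h */

    int mh_sha1_update_base(struct mh_sha1_ctx *ctx, const void *buf, uint32_t len);
    int mh_sha1_update_sse(struct mh_sha1_ctx *ctx, const void *buf, uint32_t len);
    int mh_sha1_update_avx2(struct mh_sha1_ctx *ctx, const void *buf, uint32_t len);

    int cpu_supports_sse(void);             /* hypothetical helper */
    int cpu_supports_avx2(void);            /* hypothetical helper */

    typedef int (*mh_sha1_update_fn)(struct mh_sha1_ctx *, const void *, uint32_t);

    /* Pick the most capable implementation, falling back to the base version. */
    static mh_sha1_update_fn sketch_pick_mh_sha1_update(void)
    {
            if (cpu_supports_avx2())
                    return mh_sha1_update_avx2;
            if (cpu_supports_sse())
                    return mh_sha1_update_sse;
            return mh_sha1_update_base;
    }
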
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_perf.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_perf.c
new file mode 100644
index 000000000..4fd6c09a1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_perf.c
@@ -0,0 +1,180 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha1.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Loop many times over the same buffer
+# define TEST_LEN 16*1024
+# define TEST_LOOPS 20000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define TEST_LEN 32*1024*1024
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#define TEST_MEM TEST_LEN
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA1_FUNC_TYPE
+#define MH_SHA1_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_update, MH_SHA1_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_finalize, MH_SHA1_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA1_CTX_ERROR_NONE){ \
+		printf("The mh_sha1 function failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 20 == 0)
+ printf("\n");
+ }
+ if (i % 20 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_base[SHA1_DIGEST_WORDS],
+ uint32_t hash_test[SHA1_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha1_fail = 0;
+
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_base[i])
+ mh_sha1_fail++;
+ }
+
+ if (mh_sha1_fail) {
+ printf("mh_sha1 fail test\n");
+ printf("base: ");
+ dump((char *)hash_base, 20);
+ printf("ref: ");
+ dump((char *)hash_test, 20);
+ }
+
+ return mh_sha1_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int i, fail = 0;
+ uint32_t hash_test[SHA1_DIGEST_WORDS], hash_base[SHA1_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ struct mh_sha1_ctx *update_ctx_test = NULL, *update_ctx_base = NULL;
+ struct perf start, stop;
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_perf:\n");
+
+ buff = malloc(TEST_LEN);
+ update_ctx_test = malloc(sizeof(*update_ctx_test));
+ update_ctx_base = malloc(sizeof(*update_ctx_base));
+
+ if (buff == NULL || update_ctx_base == NULL || update_ctx_test == NULL) {
+ printf("malloc failed test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ // mh_sha1 base version
+ mh_sha1_init(update_ctx_base);
+ mh_sha1_update_base(update_ctx_base, buff, TEST_LEN);
+ mh_sha1_finalize_base(update_ctx_base, hash_base);
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS / 10; i++) {
+ mh_sha1_init(update_ctx_base);
+ mh_sha1_update_base(update_ctx_base, buff, TEST_LEN);
+ mh_sha1_finalize_base(update_ctx_base, hash_base);
+ }
+ perf_stop(&stop);
+ printf("mh_sha1_update_base" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_MEM * i);
+
+ //Update feature test
+ CHECK_RETURN(mh_sha1_init(update_ctx_test));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx_test, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx_test, hash_test));
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ CHECK_RETURN(mh_sha1_init(update_ctx_test));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx_test, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx_test, hash_test));
+ }
+ perf_stop(&stop);
+ printf(xstr(TEST_UPDATE_FUNCTION) TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_MEM * i);
+
+ // Check results
+ fail = compare_digests(hash_base, hash_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", TEST_LEN);
+ return -1;
+ }
+
+ if (fail)
+ printf("Test failed function test%d\n", fail);
+ else
+ printf("Pass func check\n");
+
+ return fail;
+}
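
The benchmark above selects which implementation it measures by token-pasting
MH_SHA1_FUNC_TYPE onto the function names; how that macro gets defined (a compiler
flag or a thin wrapper source) is build-specific and not shown in this patch. A
minimal expansion example:

    #define _FUNC_TOKEN(func, type) func##type
    #define FUNC_TOKEN(func, type)  _FUNC_TOKEN(func, type)

    #define MH_SHA1_FUNC_TYPE _sse         /* example value */
    #define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_update, MH_SHA1_FUNC_TYPE)
    /* TEST_UPDATE_FUNCTION now expands to mh_sha1_update_sse */
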
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_ref.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_ref.c
new file mode 100644
index 000000000..71caba50e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_ref.c
@@ -0,0 +1,430 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha1_internal.h"
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+ // The macros and sub-functions below already exist in the source file
+ // sha1_for_mh_sha1.c, which is part of the ISA-L library as internal
+ // functions. They are duplicated here because of a linking issue:
+ // mh_sha1_ref() needs these macros and sub-functions without linking
+ // against the ISA-L library, so they are repeated here to keep the
+ // essential sub-functions in this object file.
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+#define W(x) w[(x) & 15]
+
+#define step00_19(i,a,b,c,d,e) \
+ if (i>15) W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ else W(i) = to_be32(ww[i]); \
+ e += rol32(a,5) + F1(b,c,d) + 0x5A827999 + W(i); \
+ b = rol32(b,30)
+
+#define step20_39(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F2(b,c,d) + 0x6ED9EBA1 + W(i); \
+ b = rol32(b,30)
+
+#define step40_59(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F3(b,c,d) + 0x8F1BBCDC + W(i); \
+ b = rol32(b,30)
+
+#define step60_79(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F4(b,c,d) + 0xCA62C1D6 + W(i); \
+ b = rol32(b,30)
+
+static void OPT_FIX sha1_single_for_mh_sha1_ref(const uint8_t * data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e;
+ uint32_t w[16] = { 0 };
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+
+ step00_19(0, a, b, c, d, e);
+ step00_19(1, e, a, b, c, d);
+ step00_19(2, d, e, a, b, c);
+ step00_19(3, c, d, e, a, b);
+ step00_19(4, b, c, d, e, a);
+ step00_19(5, a, b, c, d, e);
+ step00_19(6, e, a, b, c, d);
+ step00_19(7, d, e, a, b, c);
+ step00_19(8, c, d, e, a, b);
+ step00_19(9, b, c, d, e, a);
+ step00_19(10, a, b, c, d, e);
+ step00_19(11, e, a, b, c, d);
+ step00_19(12, d, e, a, b, c);
+ step00_19(13, c, d, e, a, b);
+ step00_19(14, b, c, d, e, a);
+ step00_19(15, a, b, c, d, e);
+ step00_19(16, e, a, b, c, d);
+ step00_19(17, d, e, a, b, c);
+ step00_19(18, c, d, e, a, b);
+ step00_19(19, b, c, d, e, a);
+
+ step20_39(20, a, b, c, d, e);
+ step20_39(21, e, a, b, c, d);
+ step20_39(22, d, e, a, b, c);
+ step20_39(23, c, d, e, a, b);
+ step20_39(24, b, c, d, e, a);
+ step20_39(25, a, b, c, d, e);
+ step20_39(26, e, a, b, c, d);
+ step20_39(27, d, e, a, b, c);
+ step20_39(28, c, d, e, a, b);
+ step20_39(29, b, c, d, e, a);
+ step20_39(30, a, b, c, d, e);
+ step20_39(31, e, a, b, c, d);
+ step20_39(32, d, e, a, b, c);
+ step20_39(33, c, d, e, a, b);
+ step20_39(34, b, c, d, e, a);
+ step20_39(35, a, b, c, d, e);
+ step20_39(36, e, a, b, c, d);
+ step20_39(37, d, e, a, b, c);
+ step20_39(38, c, d, e, a, b);
+ step20_39(39, b, c, d, e, a);
+
+ step40_59(40, a, b, c, d, e);
+ step40_59(41, e, a, b, c, d);
+ step40_59(42, d, e, a, b, c);
+ step40_59(43, c, d, e, a, b);
+ step40_59(44, b, c, d, e, a);
+ step40_59(45, a, b, c, d, e);
+ step40_59(46, e, a, b, c, d);
+ step40_59(47, d, e, a, b, c);
+ step40_59(48, c, d, e, a, b);
+ step40_59(49, b, c, d, e, a);
+ step40_59(50, a, b, c, d, e);
+ step40_59(51, e, a, b, c, d);
+ step40_59(52, d, e, a, b, c);
+ step40_59(53, c, d, e, a, b);
+ step40_59(54, b, c, d, e, a);
+ step40_59(55, a, b, c, d, e);
+ step40_59(56, e, a, b, c, d);
+ step40_59(57, d, e, a, b, c);
+ step40_59(58, c, d, e, a, b);
+ step40_59(59, b, c, d, e, a);
+
+ step60_79(60, a, b, c, d, e);
+ step60_79(61, e, a, b, c, d);
+ step60_79(62, d, e, a, b, c);
+ step60_79(63, c, d, e, a, b);
+ step60_79(64, b, c, d, e, a);
+ step60_79(65, a, b, c, d, e);
+ step60_79(66, e, a, b, c, d);
+ step60_79(67, d, e, a, b, c);
+ step60_79(68, c, d, e, a, b);
+ step60_79(69, b, c, d, e, a);
+ step60_79(70, a, b, c, d, e);
+ step60_79(71, e, a, b, c, d);
+ step60_79(72, d, e, a, b, c);
+ step60_79(73, c, d, e, a, b);
+ step60_79(74, b, c, d, e, a);
+ step60_79(75, a, b, c, d, e);
+ step60_79(76, e, a, b, c, d);
+ step60_79(77, d, e, a, b, c);
+ step60_79(78, c, d, e, a, b);
+ step60_79(79, b, c, d, e, a);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+}
+
+void sha1_for_mh_sha1_ref(const uint8_t * input_data, uint32_t * digest, const uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA1_BLOCK_SIZE];
+
+ digest[0] = MH_SHA1_H0;
+ digest[1] = MH_SHA1_H1;
+ digest[2] = MH_SHA1_H2;
+ digest[3] = MH_SHA1_H3;
+ digest[4] = MH_SHA1_H4;
+
+ i = len;
+ while (i >= SHA1_BLOCK_SIZE) {
+ sha1_single_for_mh_sha1_ref(input_data, digest);
+ input_data += SHA1_BLOCK_SIZE;
+ i -= SHA1_BLOCK_SIZE;
+ }
+
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA1_BLOCK_SIZE) - 8); j++)
+ buf[j] = 0;
+
+ if (i > SHA1_BLOCK_SIZE - 8)
+ i = 2 * SHA1_BLOCK_SIZE;
+ else
+ i = SHA1_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8);
+
+ sha1_single_for_mh_sha1_ref(buf, digest);
+ if (i == (2 * SHA1_BLOCK_SIZE))
+ sha1_single_for_mh_sha1_ref(buf + SHA1_BLOCK_SIZE, digest);
+}
+
+/*
+ * Rearrange one segment's data from one block into the new_data buffer.
+ *
+ * Layout of new_data:
+ * segment
+ * -------------------------
+ * w0 | w1 | ... | w15
+ *
+ */
+static inline void transform_input_single(uint32_t * new_data, uint32_t * input,
+ uint32_t segment)
+{
+ new_data[16 * segment + 0] = input[16 * 0 + segment];
+ new_data[16 * segment + 1] = input[16 * 1 + segment];
+ new_data[16 * segment + 2] = input[16 * 2 + segment];
+ new_data[16 * segment + 3] = input[16 * 3 + segment];
+ new_data[16 * segment + 4] = input[16 * 4 + segment];
+ new_data[16 * segment + 5] = input[16 * 5 + segment];
+ new_data[16 * segment + 6] = input[16 * 6 + segment];
+ new_data[16 * segment + 7] = input[16 * 7 + segment];
+ new_data[16 * segment + 8] = input[16 * 8 + segment];
+ new_data[16 * segment + 9] = input[16 * 9 + segment];
+ new_data[16 * segment + 10] = input[16 * 10 + segment];
+ new_data[16 * segment + 11] = input[16 * 11 + segment];
+ new_data[16 * segment + 12] = input[16 * 12 + segment];
+ new_data[16 * segment + 13] = input[16 * 13 + segment];
+ new_data[16 * segment + 14] = input[16 * 14 + segment];
+ new_data[16 * segment + 15] = input[16 * 15 + segment];
+}
+
+// Adapt parameters to sha1_single_for_mh_sha1_ref
+#define sha1_update_one_seg(data, digest) \
+ sha1_single_for_mh_sha1_ref((const uint8_t *)(data), (uint32_t *)(digest))
+
+/*
+ * Rearrange all segments' data from one block into the new_data buffer.
+ *
+ * Layout of new_data:
+ * segment
+ * -------------------------
+ * seg0: | w0 | w1 | ... | w15
+ * seg1: | w0 | w1 | ... | w15
+ * seg2: | w0 | w1 | ... | w15
+ * ....
+ * seg15: | w0 | w1 | ... | w15
+ *
+ */
+static inline void transform_input(uint32_t * new_data, uint32_t * input, uint32_t block)
+{
+ uint32_t *current_input = input + block * MH_SHA1_BLOCK_SIZE / 4;
+
+ transform_input_single(new_data, current_input, 0);
+ transform_input_single(new_data, current_input, 1);
+ transform_input_single(new_data, current_input, 2);
+ transform_input_single(new_data, current_input, 3);
+ transform_input_single(new_data, current_input, 4);
+ transform_input_single(new_data, current_input, 5);
+ transform_input_single(new_data, current_input, 6);
+ transform_input_single(new_data, current_input, 7);
+ transform_input_single(new_data, current_input, 8);
+ transform_input_single(new_data, current_input, 9);
+ transform_input_single(new_data, current_input, 10);
+ transform_input_single(new_data, current_input, 11);
+ transform_input_single(new_data, current_input, 12);
+ transform_input_single(new_data, current_input, 13);
+ transform_input_single(new_data, current_input, 14);
+ transform_input_single(new_data, current_input, 15);
+
+}
+
+/*
+ * Calculate all segments' digests from one block.
+ *
+ * Layout of seg_digest:
+ * segment
+ * -------------------------
+ * seg0: | H0 | H1 | ... | H4
+ * seg1: | H0 | H1 | ... | H4
+ * seg2: | H0 | H1 | ... | H4
+ * ....
+ * seg15: | H0 | H1 | ... | H4
+ *
+ */
+static inline void sha1_update_all_segs(uint32_t * new_data,
+ uint32_t(*mh_sha1_seg_digests)[SHA1_DIGEST_WORDS])
+{
+ sha1_update_one_seg(&(new_data)[16 * 0], mh_sha1_seg_digests[0]);
+ sha1_update_one_seg(&(new_data)[16 * 1], mh_sha1_seg_digests[1]);
+ sha1_update_one_seg(&(new_data)[16 * 2], mh_sha1_seg_digests[2]);
+ sha1_update_one_seg(&(new_data)[16 * 3], mh_sha1_seg_digests[3]);
+ sha1_update_one_seg(&(new_data)[16 * 4], mh_sha1_seg_digests[4]);
+ sha1_update_one_seg(&(new_data)[16 * 5], mh_sha1_seg_digests[5]);
+ sha1_update_one_seg(&(new_data)[16 * 6], mh_sha1_seg_digests[6]);
+ sha1_update_one_seg(&(new_data)[16 * 7], mh_sha1_seg_digests[7]);
+ sha1_update_one_seg(&(new_data)[16 * 8], mh_sha1_seg_digests[8]);
+ sha1_update_one_seg(&(new_data)[16 * 9], mh_sha1_seg_digests[9]);
+ sha1_update_one_seg(&(new_data)[16 * 10], mh_sha1_seg_digests[10]);
+ sha1_update_one_seg(&(new_data)[16 * 11], mh_sha1_seg_digests[11]);
+ sha1_update_one_seg(&(new_data)[16 * 12], mh_sha1_seg_digests[12]);
+ sha1_update_one_seg(&(new_data)[16 * 13], mh_sha1_seg_digests[13]);
+ sha1_update_one_seg(&(new_data)[16 * 14], mh_sha1_seg_digests[14]);
+ sha1_update_one_seg(&(new_data)[16 * 15], mh_sha1_seg_digests[15]);
+}
+
+void mh_sha1_block_ref(const uint8_t * input_data, uint32_t(*digests)[HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks)
+{
+ uint32_t i, j;
+ uint32_t *temp_buffer = (uint32_t *) frame_buffer;
+ uint32_t(*trans_digests)[SHA1_DIGEST_WORDS];
+
+ trans_digests = (uint32_t(*)[SHA1_DIGEST_WORDS]) digests;
+
+ // Re-structure seg_digests from 5*16 to 16*5
+ for (j = 0; j < HASH_SEGS; j++) {
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ temp_buffer[j * SHA1_DIGEST_WORDS + i] = digests[i][j];
+ }
+ }
+ memcpy(trans_digests, temp_buffer, 4 * SHA1_DIGEST_WORDS * HASH_SEGS);
+
+ // Calculate digests for all segments, leveraging sha1 API
+ for (i = 0; i < num_blocks; i++) {
+ transform_input(temp_buffer, (uint32_t *) input_data, i);
+ sha1_update_all_segs(temp_buffer, trans_digests);
+ }
+
+ // Re-structure seg_digests from 16*5 to 5*16
+ for (j = 0; j < HASH_SEGS; j++) {
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ temp_buffer[i * HASH_SEGS + j] = trans_digests[j][i];
+ }
+ }
+ memcpy(digests, temp_buffer, 4 * SHA1_DIGEST_WORDS * HASH_SEGS);
+
+ return;
+}
+
+void mh_sha1_tail_ref(uint8_t * partial_buffer, uint32_t total_len,
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS], uint8_t * frame_buffer,
+ uint32_t digests[SHA1_DIGEST_WORDS])
+{
+ uint64_t partial_buffer_len, len_in_bit;
+
+ partial_buffer_len = total_len % MH_SHA1_BLOCK_SIZE;
+
+ // Padding the first block
+ partial_buffer[partial_buffer_len] = 0x80;
+ partial_buffer_len++;
+ memset(partial_buffer + partial_buffer_len, 0,
+ MH_SHA1_BLOCK_SIZE - partial_buffer_len);
+
+ // Calculate the first block without total_length if the padding needs 2 blocks
+ if (partial_buffer_len > (MH_SHA1_BLOCK_SIZE - 8)) {
+ mh_sha1_block_ref(partial_buffer, mh_sha1_segs_digests, frame_buffer, 1);
+ //Padding the second block
+ memset(partial_buffer, 0, MH_SHA1_BLOCK_SIZE);
+ }
+ // Pad the last block with the total length in bits
+ len_in_bit = to_be64((uint64_t) total_len * 8);
+ *(uint64_t *) (partial_buffer + MH_SHA1_BLOCK_SIZE - 8) = len_in_bit;
+ mh_sha1_block_ref(partial_buffer, mh_sha1_segs_digests, frame_buffer, 1);
+
+ //Calculate multi-hash SHA1 digests (segment digests as input message)
+ sha1_for_mh_sha1_ref((uint8_t *) mh_sha1_segs_digests, digests,
+ 4 * SHA1_DIGEST_WORDS * HASH_SEGS);
+
+ return;
+}
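+
+/*
+ * Note: as the final call above shows, the multi-hash result is an ordinary
+ * SHA-1 computed over the 4 * SHA1_DIGEST_WORDS * HASH_SEGS = 320 bytes of
+ * interim segment digests, kept in the same word-major layout
+ * (mh_sha1_segs_digests[SHA1_DIGEST_WORDS][HASH_SEGS]) used by the block
+ * functions.
+ */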
+
+void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest)
+{
+ uint64_t total_len;
+ uint64_t num_blocks;
+ uint32_t mh_sha1_segs_digests[SHA1_DIGEST_WORDS][HASH_SEGS];
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE];
+ uint8_t partial_block_buffer[MH_SHA1_BLOCK_SIZE * 2];
+ uint32_t mh_sha1_hash_dword[SHA1_DIGEST_WORDS];
+ uint32_t i;
+ const uint8_t *input_data = (const uint8_t *)buffer;
+
+ /* Initialize digests of all segments */
+ for (i = 0; i < HASH_SEGS; i++) {
+ mh_sha1_segs_digests[0][i] = MH_SHA1_H0;
+ mh_sha1_segs_digests[1][i] = MH_SHA1_H1;
+ mh_sha1_segs_digests[2][i] = MH_SHA1_H2;
+ mh_sha1_segs_digests[3][i] = MH_SHA1_H3;
+ mh_sha1_segs_digests[4][i] = MH_SHA1_H4;
+ }
+
+ total_len = len;
+
+ // Calculate blocks
+ num_blocks = len / MH_SHA1_BLOCK_SIZE;
+ if (num_blocks > 0) {
+ //do num_blocks process
+ mh_sha1_block_ref(input_data, mh_sha1_segs_digests, frame_buffer, num_blocks);
+ len -= num_blocks * MH_SHA1_BLOCK_SIZE;
+ input_data += num_blocks * MH_SHA1_BLOCK_SIZE;
+ }
+ // Store the partial block
+ if (len != 0) {
+ memcpy(partial_block_buffer, input_data, len);
+ }
+
+ /* Finalize */
+ mh_sha1_tail_ref(partial_block_buffer, total_len, mh_sha1_segs_digests,
+ frame_buffer, mh_sha1_hash_dword);
+
+ // Output the digests of mh_sha1
+ if (mh_sha1_digest != NULL) {
+ mh_sha1_digest[0] = mh_sha1_hash_dword[0];
+ mh_sha1_digest[1] = mh_sha1_hash_dword[1];
+ mh_sha1_digest[2] = mh_sha1_hash_dword[2];
+ mh_sha1_digest[3] = mh_sha1_hash_dword[3];
+ mh_sha1_digest[4] = mh_sha1_hash_dword[4];
+ }
+
+ return;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_test.c
new file mode 100644
index 000000000..792c4452b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_test.c
@@ -0,0 +1,217 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha1.h"
+
+#define TEST_LEN 16*1024
+#define TEST_SIZE 8*1024
+#define TEST_MEM TEST_LEN
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA1_FUNC_TYPE
+#define MH_SHA1_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_update, MH_SHA1_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_finalize, MH_SHA1_FUNC_TYPE)
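+
+/*
+ * Token-pasting example: with MH_SHA1_FUNC_TYPE left empty (the default
+ * above), TEST_UPDATE_FUNCTION expands to mh_sha1_update and
+ * TEST_FINAL_FUNCTION to mh_sha1_finalize; a build that defines, say,
+ * MH_SHA1_FUNC_TYPE as _base would exercise mh_sha1_update_base and
+ * mh_sha1_finalize_base instead.
+ */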
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA1_CTX_ERROR_NONE){ \
+ printf("The mh_sha1 function failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+extern void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest);
+#define MH_SHA1_REF mh_sha1_ref
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 20 == 0)
+ printf("\n");
+ }
+ if (i % 20 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_ref[SHA1_DIGEST_WORDS],
+ uint32_t hash_test[SHA1_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha1_fail = 0;
+
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_ref[i])
+ mh_sha1_fail++;
+ }
+
+ if (mh_sha1_fail) {
+ printf("mh_sha1 fail test\n");
+ printf("ref: ");
+ dump((char *)hash_ref, 20);
+ printf("test: ");
+ dump((char *)hash_test, 20);
+ }
+
+ return mh_sha1_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int fail = 0;
+ uint32_t hash_test[SHA1_DIGEST_WORDS], hash_ref[SHA1_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ int size, offset;
+ struct mh_sha1_ctx *update_ctx = NULL;
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_test:\n");
+
+ srand(TEST_SEED);
+
+ buff = malloc(TEST_LEN);
+ update_ctx = malloc(sizeof(*update_ctx));
+
+ if (buff == NULL || update_ctx == NULL) {
+ printf("malloc failed test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ MH_SHA1_REF(buff, TEST_LEN, hash_ref);
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("fail rand1 test\n");
+ return -1;
+ } else
+ putchar('.');
+
+ // Test various size messages
+ for (size = TEST_LEN; size >= 0; size--) {
+
+ // Fill with rand data
+ rand_buffer(buff, size);
+
+ MH_SHA1_REF(buff, size, hash_ref);
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", size);
+ return -1;
+ }
+
+ if ((size & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Test various buffer offsets and sizes
+ printf("offset tests");
+ for (size = TEST_LEN - 256; size > 256; size -= 11) {
+ for (offset = 0; offset < 256; offset++) {
+ MH_SHA1_REF(buff + offset, size, hash_ref);
+
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", size);
+ return -1;
+ }
+
+ }
+ if ((size & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Run efence tests
+ printf("efence tests");
+ for (size = TEST_SIZE; size > 0; size--) {
+ offset = TEST_LEN - size;
+
+ MH_SHA1_REF(buff + offset, size, hash_ref);
+
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", size);
+ return -1;
+ }
+
+ if ((size & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_test:");
+ printf(" %s\n", fail == 0 ? "Pass" : "Fail");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_base.c
new file mode 100644
index 000000000..4af220299
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_base.c
@@ -0,0 +1,110 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/*
+ * mh_sha1_update_base.c contains the template for mh_sha1_update_XXX.
+ * The default definitions below select the base type, which generates
+ * mh_sha1_update_base. Other variants are generated by predefining these
+ * macros before this file is included (see mh_sha1.c).
+ */
+#ifndef MH_SHA1_UPDATE_FUNCTION
+#include "mh_sha1_internal.h"
+#include <string.h>
+
+#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_base
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_base
+#define MH_SHA1_UPDATE_SLVER
+#endif
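+
+/*
+ * Illustrative sketch only (the variant names are examples, not a definitive
+ * list): an architecture-specific flavour is generated by predefining the
+ * macros before including this file, e.g.
+ *
+ *   #define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_avx2
+ *   #define MH_SHA1_BLOCK_FUNCTION  mh_sha1_block_avx2
+ *   #include "mh_sha1_update_base.c"
+ *
+ * The aarch64 murmur3 sources in this library use the same include-template
+ * pattern (see mh_sha1_murmur3_asimd.c).
+ */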
+
+int MH_SHA1_UPDATE_FUNCTION(struct mh_sha1_ctx *ctx, const void *buffer, uint32_t len)
+{
+
+ uint8_t *partial_block_buffer;
+ uint64_t partial_block_len;
+ uint64_t num_blocks;
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS];
+ uint8_t *aligned_frame_buffer;
+ const uint8_t *input_data = (const uint8_t *)buffer;
+
+ if (ctx == NULL)
+ return MH_SHA1_CTX_ERROR_NULL;
+
+ if (len == 0)
+ return MH_SHA1_CTX_ERROR_NONE;
+
+ partial_block_len = ctx->total_length % MH_SHA1_BLOCK_SIZE;
+ partial_block_buffer = ctx->partial_block_buffer;
+ aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer);
+ mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests;
+
+ ctx->total_length += len;
+ // Not enough input data for mh_sha1 calculation
+ if (len + partial_block_len < MH_SHA1_BLOCK_SIZE) {
+ memcpy(partial_block_buffer + partial_block_len, input_data, len);
+ return MH_SHA1_CTX_ERROR_NONE;
+ }
+ // mh_sha1 calculation for the previous partial block
+ if (partial_block_len != 0) {
+ memcpy(partial_block_buffer + partial_block_len, input_data,
+ MH_SHA1_BLOCK_SIZE - partial_block_len);
+ //do one_block process
+ MH_SHA1_BLOCK_FUNCTION(partial_block_buffer, mh_sha1_segs_digests,
+ aligned_frame_buffer, 1);
+ input_data += MH_SHA1_BLOCK_SIZE - partial_block_len;
+ len -= MH_SHA1_BLOCK_SIZE - partial_block_len;
+ memset(partial_block_buffer, 0, MH_SHA1_BLOCK_SIZE);
+ }
+ // Calculate mh_sha1 for the current blocks
+ num_blocks = len / MH_SHA1_BLOCK_SIZE;
+ if (num_blocks > 0) {
+ //do num_blocks process
+ MH_SHA1_BLOCK_FUNCTION(input_data, mh_sha1_segs_digests, aligned_frame_buffer,
+ num_blocks);
+ len -= num_blocks * MH_SHA1_BLOCK_SIZE;
+ input_data += num_blocks * MH_SHA1_BLOCK_SIZE;
+ }
+ // Store the partial block
+ if (len != 0) {
+ memcpy(partial_block_buffer, input_data, len);
+ }
+
+ return MH_SHA1_CTX_ERROR_NONE;
+
+}
+
+#ifdef MH_SHA1_UPDATE_SLVER
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+ // Version info
+struct slver mh_sha1_update_base_slver_0000027a;
+struct slver mh_sha1_update_base_slver = { 0x027a, 0x00, 0x00 };
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_test.c
new file mode 100644
index 000000000..942dfd09f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_test.c
@@ -0,0 +1,240 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha1.h"
+
+#define TEST_LEN 16*1024
+#define TEST_SIZE 8*1024
+#define TEST_MEM TEST_LEN
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA1_FUNC_TYPE
+#define MH_SHA1_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_update, MH_SHA1_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_finalize, MH_SHA1_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA1_CTX_ERROR_NONE){ \
+ printf("The mh_sha1 function failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+extern void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest);
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 20 == 0)
+ printf("\n");
+ }
+ if (i % 20 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_ref[SHA1_DIGEST_WORDS],
+ uint32_t hash_test[SHA1_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha1_fail = 0;
+
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_ref[i])
+ mh_sha1_fail++;
+ }
+
+ if (mh_sha1_fail) {
+ printf("mh_sha1 fail test\n");
+ printf("ref: ");
+ dump((char *)hash_ref, 20);
+ printf("test: ");
+ dump((char *)hash_test, 20);
+ }
+
+ return mh_sha1_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int fail = 0, i;
+ uint32_t hash_test[SHA1_DIGEST_WORDS], hash_ref[SHA1_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ int update_count;
+ int size1, size2, offset, addr_offset;
+ struct mh_sha1_ctx *update_ctx = NULL;
+ uint8_t *mem_addr = NULL;
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_test:");
+
+ srand(TEST_SEED);
+
+ buff = malloc(TEST_LEN);
+ update_ctx = malloc(sizeof(*update_ctx));
+
+ if (buff == NULL || update_ctx == NULL) {
+ printf("malloc failed test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_ref(buff, TEST_LEN, hash_ref);
+
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("fail rand1 test\n");
+ return -1;
+ } else
+ putchar('.');
+
+ // Test various size messages by update twice.
+ printf("\n various size messages by update twice tests");
+ for (size1 = TEST_LEN; size1 >= 0; size1--) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_ref(buff, TEST_LEN, hash_ref);
+
+ // subsequent update
+ size2 = TEST_LEN - size1; // size2 is the remainder of the buffer
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size1));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + size1, size2));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size1=%d\n", size1);
+ return -1;
+ }
+
+ if ((size2 & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Test various update count
+ printf("\n various update count tests");
+ for (update_count = 1; update_count <= TEST_LEN; update_count++) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_ref(buff, TEST_LEN, hash_ref);
+
+ // subsequent update
+ size1 = TEST_LEN / update_count;
+ size2 = TEST_LEN - size1 * (update_count - 1); // size2 is the remaining tail, generally different from size1
+
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ for (i = 1, offset = 0; i < update_count; i++) {
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size1));
+ offset += size1;
+ }
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size2));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size1=%d\n", size1);
+ return -1;
+ }
+
+ if ((size2 & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // test various start address of ctx.
+ printf("\n various start address of ctx test");
+ free(update_ctx);
+ mem_addr = (uint8_t *) malloc(sizeof(*update_ctx) + AVX512_ALIGNED * 10);
+ for (addr_offset = AVX512_ALIGNED * 10; addr_offset >= 0; addr_offset--) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_ref(buff, TEST_LEN, hash_ref);
+
+ // an unaligned offset
+ update_ctx = (struct mh_sha1_ctx *)(mem_addr + addr_offset);
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail addr_offset=%d\n", addr_offset);
+ return -1;
+ }
+
+ if ((addr_offset & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ printf("\n" xstr(TEST_UPDATE_FUNCTION) "_test: %s\n", fail == 0 ? "Pass" : "Fail");
+
+ return fail;
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/sha1_for_mh_sha1.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/sha1_for_mh_sha1.c
new file mode 100644
index 000000000..224977e6c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/sha1_for_mh_sha1.c
@@ -0,0 +1,204 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "mh_sha1_internal.h"
+#include <string.h>
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference SHA1 Functions for mh_sha1
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+#define W(x) w[(x) & 15]
+
+#define step00_19(i,a,b,c,d,e) \
+ if (i>15) W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ else W(i) = to_be32(ww[i]); \
+ e += rol32(a,5) + F1(b,c,d) + 0x5A827999 + W(i); \
+ b = rol32(b,30)
+
+#define step20_39(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F2(b,c,d) + 0x6ED9EBA1 + W(i); \
+ b = rol32(b,30)
+
+#define step40_59(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F3(b,c,d) + 0x8F1BBCDC + W(i); \
+ b = rol32(b,30)
+
+#define step60_79(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F4(b,c,d) + 0xCA62C1D6 + W(i); \
+ b = rol32(b,30)
+
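+/*
+ * Note on W(x): the 80-entry SHA-1 message schedule only references
+ * W[t-3], W[t-8], W[t-14] and W[t-16], so the step macros above keep a
+ * 16-word rolling window w[] and index it modulo 16 via W(x) = w[(x) & 15].
+ */
+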
+static void OPT_FIX sha1_single_for_mh_sha1(const uint8_t * data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e;
+ uint32_t w[16] = { 0 };
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+
+ step00_19(0, a, b, c, d, e);
+ step00_19(1, e, a, b, c, d);
+ step00_19(2, d, e, a, b, c);
+ step00_19(3, c, d, e, a, b);
+ step00_19(4, b, c, d, e, a);
+ step00_19(5, a, b, c, d, e);
+ step00_19(6, e, a, b, c, d);
+ step00_19(7, d, e, a, b, c);
+ step00_19(8, c, d, e, a, b);
+ step00_19(9, b, c, d, e, a);
+ step00_19(10, a, b, c, d, e);
+ step00_19(11, e, a, b, c, d);
+ step00_19(12, d, e, a, b, c);
+ step00_19(13, c, d, e, a, b);
+ step00_19(14, b, c, d, e, a);
+ step00_19(15, a, b, c, d, e);
+ step00_19(16, e, a, b, c, d);
+ step00_19(17, d, e, a, b, c);
+ step00_19(18, c, d, e, a, b);
+ step00_19(19, b, c, d, e, a);
+
+ step20_39(20, a, b, c, d, e);
+ step20_39(21, e, a, b, c, d);
+ step20_39(22, d, e, a, b, c);
+ step20_39(23, c, d, e, a, b);
+ step20_39(24, b, c, d, e, a);
+ step20_39(25, a, b, c, d, e);
+ step20_39(26, e, a, b, c, d);
+ step20_39(27, d, e, a, b, c);
+ step20_39(28, c, d, e, a, b);
+ step20_39(29, b, c, d, e, a);
+ step20_39(30, a, b, c, d, e);
+ step20_39(31, e, a, b, c, d);
+ step20_39(32, d, e, a, b, c);
+ step20_39(33, c, d, e, a, b);
+ step20_39(34, b, c, d, e, a);
+ step20_39(35, a, b, c, d, e);
+ step20_39(36, e, a, b, c, d);
+ step20_39(37, d, e, a, b, c);
+ step20_39(38, c, d, e, a, b);
+ step20_39(39, b, c, d, e, a);
+
+ step40_59(40, a, b, c, d, e);
+ step40_59(41, e, a, b, c, d);
+ step40_59(42, d, e, a, b, c);
+ step40_59(43, c, d, e, a, b);
+ step40_59(44, b, c, d, e, a);
+ step40_59(45, a, b, c, d, e);
+ step40_59(46, e, a, b, c, d);
+ step40_59(47, d, e, a, b, c);
+ step40_59(48, c, d, e, a, b);
+ step40_59(49, b, c, d, e, a);
+ step40_59(50, a, b, c, d, e);
+ step40_59(51, e, a, b, c, d);
+ step40_59(52, d, e, a, b, c);
+ step40_59(53, c, d, e, a, b);
+ step40_59(54, b, c, d, e, a);
+ step40_59(55, a, b, c, d, e);
+ step40_59(56, e, a, b, c, d);
+ step40_59(57, d, e, a, b, c);
+ step40_59(58, c, d, e, a, b);
+ step40_59(59, b, c, d, e, a);
+
+ step60_79(60, a, b, c, d, e);
+ step60_79(61, e, a, b, c, d);
+ step60_79(62, d, e, a, b, c);
+ step60_79(63, c, d, e, a, b);
+ step60_79(64, b, c, d, e, a);
+ step60_79(65, a, b, c, d, e);
+ step60_79(66, e, a, b, c, d);
+ step60_79(67, d, e, a, b, c);
+ step60_79(68, c, d, e, a, b);
+ step60_79(69, b, c, d, e, a);
+ step60_79(70, a, b, c, d, e);
+ step60_79(71, e, a, b, c, d);
+ step60_79(72, d, e, a, b, c);
+ step60_79(73, c, d, e, a, b);
+ step60_79(74, b, c, d, e, a);
+ step60_79(75, a, b, c, d, e);
+ step60_79(76, e, a, b, c, d);
+ step60_79(77, d, e, a, b, c);
+ step60_79(78, c, d, e, a, b);
+ step60_79(79, b, c, d, e, a);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+}
+
+void sha1_for_mh_sha1(const uint8_t * input_data, uint32_t * digest, const uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA1_BLOCK_SIZE];
+
+ digest[0] = MH_SHA1_H0;
+ digest[1] = MH_SHA1_H1;
+ digest[2] = MH_SHA1_H2;
+ digest[3] = MH_SHA1_H3;
+ digest[4] = MH_SHA1_H4;
+
+ i = len;
+ while (i >= SHA1_BLOCK_SIZE) {
+ sha1_single_for_mh_sha1(input_data, digest);
+ input_data += SHA1_BLOCK_SIZE;
+ i -= SHA1_BLOCK_SIZE;
+ }
+
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA1_BLOCK_SIZE) - 8); j++)
+ buf[j] = 0;
+
+ if (i > SHA1_BLOCK_SIZE - 8)
+ i = 2 * SHA1_BLOCK_SIZE;
+ else
+ i = SHA1_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8);
+
+ sha1_single_for_mh_sha1(buf, digest);
+ if (i == (2 * SHA1_BLOCK_SIZE))
+ sha1_single_for_mh_sha1(buf + SHA1_BLOCK_SIZE, digest);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/Makefile.am b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/Makefile.am
new file mode 100644
index 000000000..e6ea6784c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/Makefile.am
@@ -0,0 +1,89 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_murmur = mh_sha1_murmur3_x64_128/murmur3_x64_128_internal.c
+
+lsrc_stitch = mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_sse.asm \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx.asm \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx2.asm \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_multibinary.asm
+
+lsrc_stitch += mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_avx512.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx512.asm
+
+lsrc_x86_64 += $(lsrc_murmur) \
+ $(lsrc_stitch)
+
+lsrc_x86_32 += $(lsrc_x86_64)
+
+lsrc_aarch64 += $(lsrc_murmur) \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c \
+ mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_dispatcher.c \
+ mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_ce.c \
+ mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_ce.S \
+ mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_asimd.c \
+ mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_asimd.S \
+ mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_multibinary.S
+
+lsrc_base_aliases += $(lsrc_murmur) \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_base_aliases.c
+
+other_src += include/reg_sizes.asm \
+ include/multibinary.asm \
+ include/test.h \
+ mh_sha1/mh_sha1_internal.h \
+ mh_sha1_murmur3_x64_128/murmur3_x64_128.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_internal.h
+
+src_include += -I $(srcdir)/mh_sha1_murmur3_x64_128
+
+extern_hdrs += include/mh_sha1_murmur3_x64_128.h
+
+unit_tests += mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_test \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_test
+
+perf_tests += mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_perf
+
+
+mh_sha1_murmur3_x64_128_test: mh_sha1_ref.o murmur3_x64_128.o
+mh_sha1_murmur3_x64_128_mh_sha1_murmur3_x64_128_test_LDADD = mh_sha1/mh_sha1_ref.lo mh_sha1_murmur3_x64_128/murmur3_x64_128.lo libisal_crypto.la
+
+mh_sha1_murmur3_x64_128_update_test: mh_sha1_ref.o murmur3_x64_128.o
+mh_sha1_murmur3_x64_128_mh_sha1_murmur3_x64_128_update_test_LDADD = mh_sha1/mh_sha1_ref.lo mh_sha1_murmur3_x64_128/murmur3_x64_128.lo libisal_crypto.la
+
+mh_sha1_murmur3_x64_128_perf: mh_sha1_ref.o murmur3_x64_128.o
+mh_sha1_murmur3_x64_128_mh_sha1_murmur3_x64_128_perf_LDADD = mh_sha1/mh_sha1_ref.lo mh_sha1_murmur3_x64_128/murmur3_x64_128.lo libisal_crypto.la
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_dispatcher.c
new file mode 100644
index 000000000..e6993703a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_dispatcher.c
@@ -0,0 +1,53 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+DEFINE_INTERFACE_DISPATCHER(mh_sha1_murmur3_x64_128_update)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA1)
+ return PROVIDER_INFO(mh_sha1_murmur3_update_ce);
+
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(mh_sha1_murmur3_update_asimd);
+
+ return PROVIDER_BASIC(mh_sha1_murmur3_x64_128_update);
+}
+
+DEFINE_INTERFACE_DISPATCHER(mh_sha1_murmur3_x64_128_finalize)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA1)
+ return PROVIDER_INFO(mh_sha1_murmur3_finalize_ce);
+
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(mh_sha1_murmur3_finalize_asimd);
+
+ return PROVIDER_BASIC(mh_sha1_murmur3_x64_128_finalize);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_internal.h b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_internal.h
new file mode 100644
index 000000000..22b33cbd2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_internal.h
@@ -0,0 +1,91 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MH_SHA1_MURMUR3_AARCH64_INTERNAL_H_
+#define _MH_SHA1_MURMUR3_AARCH64_INTERNAL_H_
+
+/**
+ * @file mh_sha1_murmur3_aarch64_internal.h
+ * @brief mh_sha1_murmur3_aarch64 internal function prototypes and macros
+ *
+ * Interface for mh_sha1_murmur3_aarch64 internal functions
+ *
+ */
+#include <stdint.h>
+#include "mh_sha1_murmur3_x64_128_internal.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /**
+ * @brief Calculate blocks whose size is MH_SHA1_BLOCK_SIZE*N
+ *
+ * @requires Crypto Extension
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param mh_sha1_digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+void mh_sha1_murmur3_block_ce(const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t
+ murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks);
+
+ /**
+ * @brief Calculate blocks whose size is MH_SHA1_BLOCK_SIZE*N
+ *
+ * @requires ASIMD
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param mh_sha1_digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+void mh_sha1_murmur3_block_asimd(const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t
+ murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_asimd.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_asimd.c
new file mode 100644
index 000000000..9cac8504e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_asimd.c
@@ -0,0 +1,54 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha1_murmur3_aarch64_internal.h"
+
+extern void mh_sha1_tail_asimd(uint8_t * partial_buffer, uint32_t total_len,
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS],
+ uint8_t * frame_buffer,
+ uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]);
+
+extern void mh_sha1_block_asimd(const uint8_t * input_data,
+ uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+
+// mh_sha1_murmur3_update_asimd.c
+#define UPDATE_FUNCTION mh_sha1_murmur3_update_asimd
+#define BLOCK_FUNCTION mh_sha1_murmur3_block_asimd
+#include "mh_sha1_murmur3_x64_128_update_base.c"
+#undef UPDATE_FUNCTION
+#undef BLOCK_FUNCTION
+
+// mh_sha1_murmur3_finalize_asimd.c
+#define FINALIZE_FUNCTION mh_sha1_murmur3_finalize_asimd
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_asimd
+#include "mh_sha1_murmur3_x64_128_finalize_base.c"
+#undef FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_asimd.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_asimd.S
new file mode 100644
index 000000000..575129f36
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_asimd.S
@@ -0,0 +1,224 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .arch armv8-a
+
+#include "sha1_asimd_common.S"
+.macro sha1_step_16_79_interleave0 windex:req
+ // interleaving murmur3 operation
+ .if (\windex % 4) == 0
+ ldp mur_data1, mur_data2, [mur_data], #16
+ .endif
+ .if (\windex % 4) == 1
+ /* rotate left by 31 bits */
+ ror mur_data1, mur_data1, #64-31
+ /* rotate left by 33 bits */
+ ror mur_data2, mur_data2, #64-33
+ .endif
+ .if (\windex % 4) == 2
+ eor mur_hash1, mur_hash1, mur_data1
+ /* rotate left by 27 bits */
+ ror mur_hash1, mur_hash1, #64-27
+ .endif
+ .if (\windex % 4) == 3
+ eor mur_hash2, mur_hash2, mur_data2
+ /* rotate left by 31 bits */
+ ror mur_hash2, mur_hash2, #64-31
+ .endif
+.endm
+
+.macro sha1_step_16_79_interleave1 windex:req
+ // interleaving murmur3 operation
+ .if (\windex % 4) == 0
+ mul mur_data1, mur_data1, mur_c1
+ mul mur_data2, mur_data2, mur_c2
+ .endif
+ .if (\windex % 4) == 1
+ mul mur_data1, mur_data1, mur_c2
+ mul mur_data2, mur_data2, mur_c1
+ .endif
+ .if (\windex % 4) == 2
+ add mur_hash1, mur_hash1, mur_hash2
+ //mur_hash1 = mur_hash1 * 5 + N1
+ add mur_hash1, mur_hash1, mur_hash1, LSL #2
+ add mur_hash1, mur_n1, mur_hash1
+ .endif
+ .if (\windex % 4) == 3
+ add mur_hash2, mur_hash2, mur_hash1
+ // mur_hash2 = mur_hash2 * 5 + N2
+ add mur_hash2, mur_hash2, mur_hash2, LSL #2
+ add mur_hash2, mur_n2, mur_hash2
+ .endif
+.endm
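+
+/*
+ * Taken together, one pass of the two interleave macros over
+ * windex % 4 == 0..3 performs a single murmur3_x64_128 body round on
+ * 16 bytes of input: k1 *= C1; k1 = rotl(k1, 31); k1 *= C2; h1 ^= k1;
+ * h1 = rotl(h1, 27) + h2; h1 = h1 * 5 + N1; and the symmetric k2/h2 half
+ * with C2/C1, rotations 33/31 and N2, spread across four SHA-1 rounds.
+ */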
+
+.macro load_x4_word idx:req
+ ld1 {WORD\idx\().16b},[segs_ptr]
+ add segs_ptr,segs_ptr,#64
+.endm
+
+/*
+ * void mh_sha1_murmur3_block_asimd (const uint8_t * input_data,
+ * uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ * uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ * uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ * uint32_t num_blocks);
+ * arg 0 pointer to input data
+ * arg 1 pointer to digests, including segment digests (uint32_t digests[16][5])
+ * arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data.
+ * arg 3 pointer to murmur3 digest
+ * arg 4 number of 1KB blocks
+ */
+
+ input_data .req x0
+ sha1_digest .req x1
+ data_buf .req x2
+ mur_digest .req x3
+ num_blocks .req w4
+
+ src .req x5
+ dst .req x6
+ offs .req x7
+ mh_segs .req x8
+ tmp .req x9
+ tmpw .req w9
+ segs_ptr .req x10
+ mur_hash1 .req x11
+ mur_hash2 .req x12
+ mur_c1 .req x13
+ mur_c2 .req x14
+ mur_data1 .req x19
+ mur_data2 .req x20
+ mur_data .req x21
+ mur_n1 .req x22
+ mur_n1_w .req w22
+ mur_n2 .req x23
+ mur_n2_w .req w23
+ block_ctr .req w24
+
+ .global mh_sha1_murmur3_block_asimd
+ .type mh_sha1_murmur3_block_asimd, %function
+mh_sha1_murmur3_block_asimd:
+ cmp num_blocks, #0
+ beq .return
+ sha1_asimd_save_stack
+ stp x19, x20, [sp, -48]!
+ stp x21, x22, [sp, 16]
+ stp x23, x24, [sp, 32]
+
+ mov mur_data, input_data
+ ldr mur_hash1, [mur_digest]
+ ldr mur_hash2, [mur_digest, 8]
+ adr mur_c1, C1
+ ldr mur_c1, [mur_c1]
+ adr mur_c2, C2
+ ldr mur_c2, [mur_c2]
+ adr tmp, N1
+ ldr mur_n1_w, [tmp]
+ adr tmp, N2
+ ldr mur_n2_w, [tmp]
+
+ mov mh_segs, #0
+.seg_loops:
+ add segs_ptr,input_data,mh_segs
+ mov offs, #64
+ add src, sha1_digest, mh_segs
+ ld1 {VA.4S}, [src], offs
+ ld1 {VB.4S}, [src], offs
+ ld1 {VC.4S}, [src], offs
+ ld1 {VD.4S}, [src], offs
+ ld1 {VE.4S}, [src], offs
+ mov block_ctr,num_blocks
+
+.block_loop:
+ sha1_single
+ subs block_ctr, block_ctr, 1
+ bne .block_loop
+
+ mov offs, #64
+ add dst, sha1_digest, mh_segs
+ st1 {VA.4S}, [dst], offs
+ st1 {VB.4S}, [dst], offs
+ st1 {VC.4S}, [dst], offs
+ st1 {VD.4S}, [dst], offs
+ st1 {VE.4S}, [dst], offs
+
+ add mh_segs, mh_segs, #16
+ cmp mh_segs, #64
+ bne .seg_loops
+
+ /* save murmur-hash digest */
+ str mur_hash1, [mur_digest], #8
+ str mur_hash2, [mur_digest]
+
+ ldp x21, x22, [sp, 16]
+ ldp x23, x24, [sp, 32]
+ ldp x19, x20, [sp], 48
+ sha1_asimd_restore_stack
+.return:
+ ret
+
+ .size mh_sha1_murmur3_block_asimd, .-mh_sha1_murmur3_block_asimd
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 16
+KEY_0:
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+KEY_1:
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+KEY_2:
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+KEY_3:
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+N1:
+ .word 0x52dce729
+ .word 0x52dce729
+ .word 0x52dce729
+ .word 0x52dce729
+N2:
+ .word 0x38495ab5
+ .word 0x38495ab5
+ .word 0x38495ab5
+ .word 0x38495ab5
+C1:
+ .dword 0x87c37b91114253d5
+ .dword 0x87c37b91114253d5
+C2:
+ .dword 0x4cf5ad432745937f
+ .dword 0x4cf5ad432745937f
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_ce.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_ce.S
new file mode 100644
index 000000000..7f4256e20
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_ce.S
@@ -0,0 +1,482 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg lane0_msg_0, 0
+ declare_var_vector_reg lane1_msg_0, 1
+ declare_var_vector_reg lane2_msg_0, 2
+ declare_var_vector_reg lane3_msg_0, 3
+ declare_var_vector_reg lane0_msg_1, 4
+ declare_var_vector_reg lane1_msg_1, 5
+ declare_var_vector_reg lane2_msg_1, 6
+ declare_var_vector_reg lane3_msg_1, 7
+ declare_var_vector_reg lane0_msg_2, 8
+ declare_var_vector_reg lane1_msg_2, 9
+ declare_var_vector_reg lane2_msg_2,10
+ declare_var_vector_reg lane3_msg_2,11
+ declare_var_vector_reg lane0_msg_3,12
+ declare_var_vector_reg lane1_msg_3,13
+ declare_var_vector_reg lane2_msg_3,14
+ declare_var_vector_reg lane3_msg_3,15
+
+ declare_var_vector_reg lane0_abcd ,16
+ declare_var_vector_reg lane1_abcd ,17
+ declare_var_vector_reg lane2_abcd ,18
+ declare_var_vector_reg lane3_abcd ,19
+ declare_var_vector_reg lane0_tmp0 ,20
+ declare_var_vector_reg lane1_tmp0 ,21
+ declare_var_vector_reg lane2_tmp0 ,22
+ declare_var_vector_reg lane3_tmp0 ,23
+ declare_var_vector_reg lane0_tmp1 ,24
+ declare_var_vector_reg lane1_tmp1 ,25
+ declare_var_vector_reg lane2_tmp1 ,26
+ declare_var_vector_reg lane3_tmp1 ,27
+
+
+ declare_var_vector_reg e0 ,28
+ declare_var_vector_reg e1 ,29
+ declare_var_vector_reg key ,30
+ declare_var_vector_reg tmp ,31
+
+ key_adr .req x5
+ msg_adr .req x6
+ block_cnt .req x7
+ offs .req x8
+ mur_n1 .req x9
+ mur_n1_w .req w9
+ mur_n2 .req x10
+ mur_n2_w .req w10
+ mur_hash1 .req x11
+ mur_hash2 .req x12
+ mur_c1 .req x13
+ mur_c2 .req x14
+ mur_data1 .req x15
+
+ digest_adr .req x16
+ tmp0_adr .req x17
+ tmp1_adr .req x18
+ mur_data2 .req x19
+ mur_data .req x20
+
+.macro murmur3_00
+ ldp mur_data1, mur_data2, [mur_data], #16
+ mul mur_data1, mur_data1, mur_c1
+ mul mur_data2, mur_data2, mur_c2
+.endm
+
+.macro murmur3_01
+ /* rotate left by 31 bits */
+ ror mur_data1, mur_data1, #64-31
+ /* rotate left by 33 bits */
+ ror mur_data2, mur_data2, #64-33
+ mul mur_data1, mur_data1, mur_c2
+ mul mur_data2, mur_data2, mur_c1
+.endm
+
+.macro murmur3_02
+ eor mur_hash1, mur_hash1, mur_data1
+ /* rotate left by 27 bits */
+ ror mur_hash1, mur_hash1, #64-27
+ add mur_hash1, mur_hash1, mur_hash2
+ // mur_hash1 = mur_hash1 * 5 + N1
+ add mur_hash1, mur_hash1, mur_hash1, LSL #2
+ add mur_hash1, mur_n1, mur_hash1
+.endm
+
+.macro murmur3_03
+ eor mur_hash2, mur_hash2, mur_data2
+ /* rotate left by 31 bits */
+ ror mur_hash2, mur_hash2, #64-31
+ add mur_hash2, mur_hash2, mur_hash1
+ // mur_hash2 = mur_hash2 * 5 + N2
+ add mur_hash2, mur_hash2, mur_hash2, LSL #2
+ add mur_hash2, mur_n2, mur_hash2
+.endm
+
+/**
+ * macros for rounds 4-67
+ * the code executes 16 times per block, allowing the interleaved murmur3 operations to process 256 bytes
+*/
+.macro sha1_4_rounds inst:req,msg0:req,msg1:req,msg2:req,msg3:req,abcd:req,e0:req,tmp0:req,e1:req,tmp1:req
+ sha1h lane0_\tmp0\()_s, lane0_\abcd\()_s
+ sha1h lane1_\tmp0\()_s, lane1_\abcd\()_s
+ sha1h lane2_\tmp0\()_s, lane2_\abcd\()_s
+ sha1h lane3_\tmp0\()_s, lane3_\abcd\()_s
+ mov \e0\()_v.S[0],lane0_\tmp0\()_v.S[0]
+ mov \e0\()_v.S[1],lane1_\tmp0\()_v.S[0]
+ mov \e0\()_v.S[2],lane2_\tmp0\()_v.S[0]
+ mov \e0\()_v.S[3],lane3_\tmp0\()_v.S[0]
+ mov lane0_\tmp0\()_v.S[0],\e1\()_v.S[0]
+ mov lane1_\tmp0\()_v.S[0],\e1\()_v.S[1]
+ mov lane2_\tmp0\()_v.S[0],\e1\()_v.S[2]
+ mov lane3_\tmp0\()_v.S[0],\e1\()_v.S[3]
+ \inst lane0_\abcd\()_q,lane0_\tmp0\()_s,lane0_\tmp1\()_v.4s
+ murmur3_00
+ \inst lane1_\abcd\()_q,lane1_\tmp0\()_s,lane1_\tmp1\()_v.4s
+ murmur3_01
+ \inst lane2_\abcd\()_q,lane2_\tmp0\()_s,lane2_\tmp1\()_v.4s
+ murmur3_02
+ \inst lane3_\abcd\()_q,lane3_\tmp0\()_s,lane3_\tmp1\()_v.4s
+ murmur3_03
+ ld1 {lane0_\tmp0\()_v.4s-lane3_\tmp0\()_v.4s},[\tmp0\()_adr]
+ add lane0_\tmp1\()_v.4s,lane0_\msg3\()_v.4s,key_v.4s
+ add lane1_\tmp1\()_v.4s,lane1_\msg3\()_v.4s,key_v.4s
+ add lane2_\tmp1\()_v.4s,lane2_\msg3\()_v.4s,key_v.4s
+ add lane3_\tmp1\()_v.4s,lane3_\msg3\()_v.4s,key_v.4s
+ st1 {lane0_\tmp1\()_v.4s-lane3_\tmp1\()_v.4s},[\tmp1\()_adr]
+ sha1su1 lane0_\msg0\()_v.4s,lane0_\msg3\()_v.4s
+ sha1su1 lane1_\msg0\()_v.4s,lane1_\msg3\()_v.4s
+ sha1su1 lane2_\msg0\()_v.4s,lane2_\msg3\()_v.4s
+ sha1su1 lane3_\msg0\()_v.4s,lane3_\msg3\()_v.4s
+ sha1su0 lane0_\msg1\()_v.4s,lane0_\msg2\()_v.4s,lane0_\msg3\()_v.4s
+ sha1su0 lane1_\msg1\()_v.4s,lane1_\msg2\()_v.4s,lane1_\msg3\()_v.4s
+ sha1su0 lane2_\msg1\()_v.4s,lane2_\msg2\()_v.4s,lane2_\msg3\()_v.4s
+ sha1su0 lane3_\msg1\()_v.4s,lane3_\msg2\()_v.4s,lane3_\msg3\()_v.4s
+.endm
+
+
+/*
+ * void mh_sha1_murmur3_block_ce (const uint8_t * input_data,
+ * uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ * uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ * uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ * uint32_t num_blocks);
+ * arg 0 pointer to input data
+ * arg 1 pointer to digests, including the segment digests (uint32_t digests[16][5])
+ * arg 2 pointer to aligned_frame_buffer, which is used to save the big-endian data.
+ * arg 3 pointer to murmur3 digest
+ * arg 4 number of 1KB blocks
+ */
+
+/*
+Arguments list
+*/
+ input_data .req x0
+ digests .req x1
+ frame_buffer .req x2
+ mur_digest .req x3
+ num_blocks .req w4
+
+ .global mh_sha1_murmur3_block_ce
+ .type mh_sha1_murmur3_block_ce, %function
+mh_sha1_murmur3_block_ce:
+	// save callee-saved vector and general registers
+ stp d8, d9, [sp, -80]!
+
+ stp d10, d11, [sp, 16]
+ stp d12, d13, [sp, 32]
+ stp d14, d15, [sp, 48]
+ stp x19, x20, [sp, 64]
+
+ mov mur_data, input_data
+ ldr mur_hash1, [mur_digest]
+ ldr mur_hash2, [mur_digest, 8]
+ adr mur_c1, C1
+ ldr mur_c1, [mur_c1]
+ adr mur_c2, C2
+ ldr mur_c2, [mur_c2]
+ adr tmp0_adr, N1
+ ldr mur_n1_w, [tmp0_adr]
+ adr tmp0_adr, N2
+ ldr mur_n2_w, [tmp0_adr]
+
+ mov tmp0_adr,frame_buffer
+ add tmp1_adr,tmp0_adr,128
+
+
+start_loop:
+ mov block_cnt,0
+ mov msg_adr,input_data
+lane_loop:
+ mov offs,64
+ adr key_adr,KEY_0
+ // load msg 0
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[0],[msg_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[1],[msg_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[2],[msg_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[3],[msg_adr],offs
+
+ ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[0],[msg_adr],offs
+ ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[1],[msg_adr],offs
+ ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[2],[msg_adr],offs
+ ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[3],[msg_adr],offs
+
+ ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[0],[msg_adr],offs
+ ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[1],[msg_adr],offs
+ ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[2],[msg_adr],offs
+ ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[3],[msg_adr],offs
+
+ ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[0],[msg_adr],offs
+ ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[1],[msg_adr],offs
+ ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[2],[msg_adr],offs
+ ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[3],[msg_adr],offs
+
+ add digest_adr,digests,block_cnt
+ ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[0],[digest_adr],offs
+ ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[1],[digest_adr],offs
+ ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[2],[digest_adr],offs
+ ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[3],[digest_adr],offs
+ ldr e0_q,[digest_adr]
+
+ // load key_0
+ ldr key_q,[key_adr]
+
+ rev32 lane0_msg_0_v.16b,lane0_msg_0_v.16b
+ rev32 lane1_msg_0_v.16b,lane1_msg_0_v.16b
+ rev32 lane2_msg_0_v.16b,lane2_msg_0_v.16b
+ rev32 lane3_msg_0_v.16b,lane3_msg_0_v.16b
+ rev32 lane0_msg_1_v.16b,lane0_msg_1_v.16b
+ rev32 lane1_msg_1_v.16b,lane1_msg_1_v.16b
+ rev32 lane2_msg_1_v.16b,lane2_msg_1_v.16b
+ rev32 lane3_msg_1_v.16b,lane3_msg_1_v.16b
+ rev32 lane0_msg_2_v.16b,lane0_msg_2_v.16b
+ rev32 lane1_msg_2_v.16b,lane1_msg_2_v.16b
+ rev32 lane2_msg_2_v.16b,lane2_msg_2_v.16b
+ rev32 lane3_msg_2_v.16b,lane3_msg_2_v.16b
+ rev32 lane0_msg_3_v.16b,lane0_msg_3_v.16b
+ rev32 lane1_msg_3_v.16b,lane1_msg_3_v.16b
+ rev32 lane2_msg_3_v.16b,lane2_msg_3_v.16b
+ rev32 lane3_msg_3_v.16b,lane3_msg_3_v.16b
+
+ add lane0_tmp1_v.4s,lane0_msg_1_v.4s,key_v.4s
+ add lane1_tmp1_v.4s,lane1_msg_1_v.4s,key_v.4s
+ add lane2_tmp1_v.4s,lane2_msg_1_v.4s,key_v.4s
+ add lane3_tmp1_v.4s,lane3_msg_1_v.4s,key_v.4s
+ st1 {lane0_tmp1_v.4s-lane3_tmp1_v.4s},[tmp1_adr]
+
+ add lane0_tmp0_v.4s,lane0_msg_0_v.4s,key_v.4s
+ add lane1_tmp0_v.4s,lane1_msg_0_v.4s,key_v.4s
+ add lane2_tmp0_v.4s,lane2_msg_0_v.4s,key_v.4s
+ add lane3_tmp0_v.4s,lane3_msg_0_v.4s,key_v.4s
+
+ /* rounds 0-3 */
+ sha1h lane0_tmp1_s,lane0_abcd_s
+ sha1h lane1_tmp1_s,lane1_abcd_s
+ sha1h lane2_tmp1_s,lane2_abcd_s
+ sha1h lane3_tmp1_s,lane3_abcd_s
+ mov e1_v.S[0],lane0_tmp1_v.S[0]
+ mov e1_v.S[1],lane1_tmp1_v.S[0]
+ mov e1_v.S[2],lane2_tmp1_v.S[0]
+ mov e1_v.S[3],lane3_tmp1_v.S[0]
+ mov lane0_tmp1_v.S[0],e0_v.S[0]
+ mov lane1_tmp1_v.S[0],e0_v.S[1]
+ mov lane2_tmp1_v.S[0],e0_v.S[2]
+ mov lane3_tmp1_v.S[0],e0_v.S[3]
+ sha1c lane0_abcd_q,lane0_tmp1_s,lane0_tmp0_v.4s
+ sha1c lane1_abcd_q,lane1_tmp1_s,lane1_tmp0_v.4s
+ sha1c lane2_abcd_q,lane2_tmp1_s,lane2_tmp0_v.4s
+ sha1c lane3_abcd_q,lane3_tmp1_s,lane3_tmp0_v.4s
+ ld1 {lane0_tmp1_v.4s-lane3_tmp1_v.4s},[tmp1_adr]
+ add lane0_tmp0_v.4s,lane0_msg_2_v.4s,key_v.4s
+ sha1su0 lane0_msg_0_v.4s,lane0_msg_1_v.4s,lane0_msg_2_v.4s
+ add lane1_tmp0_v.4s,lane1_msg_2_v.4s,key_v.4s
+ sha1su0 lane1_msg_0_v.4s,lane1_msg_1_v.4s,lane1_msg_2_v.4s
+ add lane2_tmp0_v.4s,lane2_msg_2_v.4s,key_v.4s
+ sha1su0 lane2_msg_0_v.4s,lane2_msg_1_v.4s,lane2_msg_2_v.4s
+ add lane3_tmp0_v.4s,lane3_msg_2_v.4s,key_v.4s
+ sha1su0 lane3_msg_0_v.4s,lane3_msg_1_v.4s,lane3_msg_2_v.4s
+ st1 {lane0_tmp0_v.4s-lane3_tmp0_v.4s},[tmp0_adr]
+
+ sha1_4_rounds sha1c,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 4-7 */
+ sha1_4_rounds sha1c,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0
+
+
+ adr key_adr,KEY_1
+ ldr key_q,[key_adr]
+ sha1_4_rounds sha1c,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1 /* rounds 12-15 */
+ sha1_4_rounds sha1c,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0
+ sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 20-23 */
+ sha1_4_rounds sha1p,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0
+ sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1
+
+ adr key_adr,KEY_2
+ ldr key_q,[key_adr]
+ sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0
+ sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 36-39 */
+ sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0
+ sha1_4_rounds sha1m,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1
+ sha1_4_rounds sha1m,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0
+
+ adr key_adr,KEY_3
+ ldr key_q,[key_adr]
+ sha1_4_rounds sha1m,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 52-55 */
+ sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0
+ sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1
+ sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0
+
+ // msg2 and msg1 are free
+ mov lane0_msg_2_v.S[0],e1_v.S[0]
+ mov lane1_msg_2_v.S[0],e1_v.S[1]
+ mov lane2_msg_2_v.S[0],e1_v.S[2]
+ mov lane3_msg_2_v.S[0],e1_v.S[3]
+
+ /* rounds 68-71 */
+ sha1h lane0_msg_1_s,lane0_abcd_s
+ sha1h lane1_msg_1_s,lane1_abcd_s
+ sha1h lane2_msg_1_s,lane2_abcd_s
+ sha1h lane3_msg_1_s,lane3_abcd_s
+ sha1p lane0_abcd_q,lane0_msg_2_s,lane0_tmp1_v.4s
+ sha1p lane1_abcd_q,lane1_msg_2_s,lane1_tmp1_v.4s
+ sha1p lane2_abcd_q,lane2_msg_2_s,lane2_tmp1_v.4s
+ sha1p lane3_abcd_q,lane3_msg_2_s,lane3_tmp1_v.4s
+ add lane0_tmp1_v.4s,lane0_msg_3_v.4s,key_v.4s
+ add lane1_tmp1_v.4s,lane1_msg_3_v.4s,key_v.4s
+ add lane2_tmp1_v.4s,lane2_msg_3_v.4s,key_v.4s
+ add lane3_tmp1_v.4s,lane3_msg_3_v.4s,key_v.4s
+ sha1su1 lane0_msg_0_v.4s,lane0_msg_3_v.4s
+ sha1su1 lane1_msg_0_v.4s,lane1_msg_3_v.4s
+ sha1su1 lane2_msg_0_v.4s,lane2_msg_3_v.4s
+ sha1su1 lane3_msg_0_v.4s,lane3_msg_3_v.4s
+
+ /* rounds 72-75 */
+ sha1h lane0_msg_2_s,lane0_abcd_s
+ sha1h lane1_msg_2_s,lane1_abcd_s
+ sha1h lane2_msg_2_s,lane2_abcd_s
+ sha1h lane3_msg_2_s,lane3_abcd_s
+ sha1p lane0_abcd_q,lane0_msg_1_s,lane0_tmp0_v.4s
+ sha1p lane1_abcd_q,lane1_msg_1_s,lane1_tmp0_v.4s
+ sha1p lane2_abcd_q,lane2_msg_1_s,lane2_tmp0_v.4s
+ sha1p lane3_abcd_q,lane3_msg_1_s,lane3_tmp0_v.4s
+
+ /* rounds 76-79 */
+ sha1h lane0_msg_1_s,lane0_abcd_s
+ sha1h lane1_msg_1_s,lane1_abcd_s
+ sha1h lane2_msg_1_s,lane2_abcd_s
+ sha1h lane3_msg_1_s,lane3_abcd_s
+ sha1p lane0_abcd_q,lane0_msg_2_s,lane0_tmp1_v.4s
+ sha1p lane1_abcd_q,lane1_msg_2_s,lane1_tmp1_v.4s
+ sha1p lane2_abcd_q,lane2_msg_2_s,lane2_tmp1_v.4s
+ sha1p lane3_abcd_q,lane3_msg_2_s,lane3_tmp1_v.4s
+ add digest_adr,digests,block_cnt
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[0],[digest_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[1],[digest_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[2],[digest_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[3],[digest_adr],offs
+ ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[0],[digest_adr]
+
+ add lane0_abcd_v.4S,lane0_abcd_v.4S,lane0_msg_0_v.4S
+ add lane1_abcd_v.4S,lane1_abcd_v.4S,lane1_msg_0_v.4S
+ add lane2_abcd_v.4S,lane2_abcd_v.4S,lane2_msg_0_v.4S
+ add lane3_abcd_v.4S,lane3_abcd_v.4S,lane3_msg_0_v.4S
+
+ add lane0_msg_1_v.4S,lane0_msg_1_v.4S,lane0_msg_3_v.4S
+ add lane1_msg_1_v.4S,lane1_msg_1_v.4S,lane1_msg_3_v.4S
+ add lane2_msg_1_v.4S,lane2_msg_1_v.4S,lane2_msg_3_v.4S
+ add lane3_msg_1_v.4S,lane3_msg_1_v.4S,lane3_msg_3_v.4S
+
+ add digest_adr,digests,block_cnt
+ st4 {lane0_abcd_v.S-lane3_abcd_v.S}[0],[digest_adr],offs
+ st4 {lane0_abcd_v.S-lane3_abcd_v.S}[1],[digest_adr],offs
+ st4 {lane0_abcd_v.S-lane3_abcd_v.S}[2],[digest_adr],offs
+ st4 {lane0_abcd_v.S-lane3_abcd_v.S}[3],[digest_adr],offs
+ st4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[0],[digest_adr]
+
+ add block_cnt,block_cnt,16
+ cmp block_cnt,64
+ add msg_adr,input_data,block_cnt
+ add digest_adr,digests,block_cnt
+ bcc lane_loop
+
+ subs num_blocks,num_blocks,1
+ add input_data,input_data,1024
+ bhi start_loop
+
+ /* save murmur-hash digest */
+ str mur_hash1, [mur_digest], #8
+ str mur_hash2, [mur_digest]
+
+exit_func:
+	// restore callee-saved registers
+ ldp d10, d11, [sp, 16]
+ ldp d12, d13, [sp, 32]
+ ldp d14, d15, [sp, 48]
+ ldp x19, x20, [sp, 64]
+ ldp d8, d9, [sp], 80
+ ret
+
+ .size mh_sha1_murmur3_block_ce, .-mh_sha1_murmur3_block_ce
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 4
+KEY_0:
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+KEY_1:
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+KEY_2:
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+KEY_3:
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+
+N1:
+ .word 0x52dce729
+ .word 0x52dce729
+ .word 0x52dce729
+ .word 0x52dce729
+N2:
+ .word 0x38495ab5
+ .word 0x38495ab5
+ .word 0x38495ab5
+ .word 0x38495ab5
+
+C1:
+ .dword 0x87c37b91114253d5
+ .dword 0x87c37b91114253d5
+C2:
+ .dword 0x4cf5ad432745937f
+ .dword 0x4cf5ad432745937f
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_ce.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_ce.c
new file mode 100644
index 000000000..4da674fba
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_ce.c
@@ -0,0 +1,54 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha1_murmur3_aarch64_internal.h"
+
+extern void mh_sha1_tail_ce(uint8_t * partial_buffer, uint32_t total_len,
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS],
+ uint8_t * frame_buffer,
+ uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]);
+
+extern void mh_sha1_block_ce(const uint8_t * input_data,
+ uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+
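+// The wrappers below are stamped out by including the generic base sources
+// with UPDATE_FUNCTION/BLOCK_FUNCTION (and FINALIZE_FUNCTION/
+// MH_SHA1_TAIL_FUNCTION) bound to the CE-specific symbols above.
+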
+// mh_sha1_murmur3_update_ce.c
+#define UPDATE_FUNCTION mh_sha1_murmur3_update_ce
+#define BLOCK_FUNCTION mh_sha1_murmur3_block_ce
+#include "mh_sha1_murmur3_x64_128_update_base.c"
+#undef UPDATE_FUNCTION
+#undef BLOCK_FUNCTION
+
+// mh_sha1_murmur3_finalize_ce.c
+#define FINALIZE_FUNCTION mh_sha1_murmur3_finalize_ce
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_ce
+#include "mh_sha1_murmur3_x64_128_finalize_base.c"
+#undef FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_multibinary.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_multibinary.S
new file mode 100644
index 000000000..051a6157e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_multibinary.S
@@ -0,0 +1,34 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#include "aarch64_multibinary.h"
+
+mbin_interface mh_sha1_murmur3_x64_128_update
+mbin_interface mh_sha1_murmur3_x64_128_finalize
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/sha1_asimd_common.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/sha1_asimd_common.S
new file mode 100644
index 000000000..ccc66f41a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/sha1_asimd_common.S
@@ -0,0 +1,271 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .arch armv8-a
+
+// macro F = (D ^ (B & (C ^ D)))
+.macro FUNC_F0
+ eor VF.16b, VC.16b, VD.16b
+ and VF.16b, VB.16b, VF.16b
+ eor VF.16b, VD.16b, VF.16b
+.endm
+
+// F = (B ^ C ^ D)
+.macro FUNC_F1
+ eor VF.16b, VB.16b, VC.16b
+ eor VF.16b, VF.16b, VD.16b
+.endm
+
+// F = ((B & C) | (B & D) | (C & D))
+.macro FUNC_F2
+ and vT0.16b, VB.16b, VC.16b
+ and vT1.16b, VB.16b, VD.16b
+ and vT2.16b, VC.16b, VD.16b
+ orr VF.16b, vT0.16b, vT1.16b
+ orr VF.16b, VF.16b, vT2.16b
+.endm
+
+// F = (B ^ C ^ D)
+.macro FUNC_F3
+ FUNC_F1
+.endm
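+
+/*
+ * Round-function schedule from the SHA1 spec (FIPS 180-1): FUNC_F0 (Ch)
+ * covers rounds 0-19, FUNC_F1 (Parity) rounds 20-39, FUNC_F2 (Maj) rounds
+ * 40-59, and rounds 60-79 reuse Parity, which is why FUNC_F3 simply expands
+ * to FUNC_F1.
+ */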
+
+.altmacro
+.macro load_next_word windex
+ .if \windex < 16
+ load_x4_word \windex
+ .endif
+.endm
+
+// FUNC_F0 is merged into STEP_00_15 for efficiency
+.macro SHA1_STEP_00_15_F0 windex:req
+ rev32 WORD\windex\().16b,WORD\windex\().16b
+ next_word=\windex+1
+ load_next_word %next_word
+ // e = (a leftrotate 5) + f + e + k + w[i]
+ ushr VT.4s, VA.4s, 32 - 5
+ add VE.4s, VE.4s, VK.4s
+ sli VT.4s, VA.4s, 5
+ eor VF.16b, VC.16b, VD.16b
+ add VE.4s, VE.4s, WORD\windex\().4s
+ and VF.16b, VB.16b, VF.16b
+ add VE.4s, VE.4s, VT.4s
+ eor VF.16b, VD.16b, VF.16b
+ ushr VT.4s, VB.4s, 32 - 30
+ add VE.4s, VE.4s, VF.4s
+ sli VT.4s, VB.4s, 30
+.endm
+
+.macro SHA1_STEP_16_79 windex:req,func_f:req,reg_3:req,reg_8:req,reg_14:req,reg_16:req
+ eor vT0.16b,\reg_3\().16b,\reg_8\().16b
+ eor VT.16b,\reg_14\().16b,\reg_16\().16b
+ sha1_step_16_79_interleave0 \windex
+ eor vT0.16b,vT0.16b,VT.16b
+ sha1_step_16_79_interleave1 \windex
+ // e = (a leftrotate 5) + f + e + k + w[i]
+ ushr VT.4s, vT0.4s, 32 - 1
+ add VE.4s, VE.4s, VK.4s
+ ushr vT1.4s, VA.4s, 32 - 5
+ sli VT.4s, vT0.4s, 1
+ add VE.4s, VE.4s, VT.4s
+ sli vT1.4s, VA.4s, 5
+ mov \reg_16\().16b,VT.16b
+ add VE.4s, VE.4s, vT1.4s
+ ushr VT.4s, VB.4s, 32 - 30
+ \func_f
+ add VE.4s, VE.4s, VF.4s
+ sli VT.4s, VB.4s, 30
+.endm
+
+ VA .req v0
+ VB .req v1
+ VC .req v2
+ VD .req v3
+ VE .req v4
+ VT .req v5
+ VF .req v6
+ VK .req v7
+ WORD0 .req v8
+ WORD1 .req v9
+ WORD2 .req v10
+ WORD3 .req v11
+ WORD4 .req v12
+ WORD5 .req v13
+ WORD6 .req v14
+ WORD7 .req v15
+ WORD8 .req v16
+ WORD9 .req v17
+ WORD10 .req v18
+ WORD11 .req v19
+ WORD12 .req v20
+ WORD13 .req v21
+ WORD14 .req v22
+ WORD15 .req v23
+ vT0 .req v24
+ vT1 .req v25
+ vT2 .req v26
+ vAA .req v27
+ vBB .req v28
+ vCC .req v29
+ vDD .req v30
+ vEE .req v31
+ TT .req v0
+ sha1key_adr .req x15
+
+.macro SWAP_STATES
+ // shifted VB is held in VT after each step
+ .unreq TT
+ TT .req VE
+ .unreq VE
+ VE .req VD
+ .unreq VD
+ VD .req VC
+ .unreq VC
+ VC .req VT
+ .unreq VT
+ VT .req VB
+ .unreq VB
+ VB .req VA
+ .unreq VA
+ VA .req TT
+.endm
+
+.altmacro
+.macro SHA1_STEP_16_79_WRAPPER windex:req,func_f:req,idx3:req,idx8:req,idx14:req,idx16:req
+ SHA1_STEP_16_79 \windex,\func_f,WORD\idx3\(),WORD\idx8\(),WORD\idx14\(),WORD\idx16\()
+.endm
+
+.macro exec_step windex:req
+ .if \windex <= 15
+ SHA1_STEP_00_15_F0 windex
+ .else
+ idx14=((\windex - 14) & 15)
+ idx8=((\windex - 8) & 15)
+ idx3=((\windex - 3) & 15)
+ idx16=(\windex & 15)
+ .if \windex <= 19
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F0,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 20 && \windex <= 39
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F1,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 40 && \windex <= 59
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F2,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 60 && \windex <= 79
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F3,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .endif
+
+ SWAP_STATES
+
+ .if \windex == 79
+	// after 80 steps, the registers ABCDET have shifted from
+	// their original order of 012345 to 341520,
+	// so they have to be swapped back for both compile- and run-time correctness
+ mov v0.16b,v3.16b
+ .unreq VA
+ VA .req v0
+
+ mov vT0.16b,v2.16b
+ mov v2.16b,v1.16b
+ mov v1.16b,v4.16b
+ .unreq VB
+ VB .req v1
+ .unreq VC
+ VC .req v2
+
+ mov v3.16b,v5.16b
+ .unreq VD
+ VD .req v3
+
+ mov v4.16b,vT0.16b
+ .unreq VE
+ VE .req v4
+
+ .unreq VT
+ VT .req v5
+ .endif
+.endm
+
+.macro exec_steps idx:req,more:vararg
+ exec_step \idx
+ .ifnb \more
+ exec_steps \more
+ .endif
+.endm
+
+.macro sha1_single
+ load_x4_word 0
+
+ mov vAA.16B, VA.16B
+ mov vBB.16B, VB.16B
+ mov vCC.16B, VC.16B
+ mov vDD.16B, VD.16B
+ mov vEE.16B, VE.16B
+
+ adr sha1key_adr, KEY_0
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
+
+ // 20 ~ 39
+ adr sha1key_adr, KEY_1
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
+
+ // 40 ~ 59
+ adr sha1key_adr, KEY_2
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59
+
+ // 60 ~ 79
+ adr sha1key_adr, KEY_3
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79
+
+ add VA.4s, vAA.4s, VA.4s
+ add VB.4s, vBB.4s, VB.4s
+ add VC.4s, vCC.4s, VC.4s
+ add VD.4s, vDD.4s, VD.4s
+ add VE.4s, vEE.4s, VE.4s
+.endm
+
+.macro sha1_asimd_save_stack
+ stp d8,d9,[sp, -64]!
+ stp d10,d11,[sp, 16]
+ stp d12,d13,[sp, 32]
+ stp d14,d15,[sp, 48]
+.endm
+
+.macro sha1_asimd_restore_stack
+ ldp d10,d11,[sp, 16]
+ ldp d12,d13,[sp, 32]
+ ldp d14,d15,[sp, 48]
+ ldp d8,d9,[sp],64
+.endm
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c
new file mode 100644
index 000000000..518adb797
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c
@@ -0,0 +1,154 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha1_murmur3_x64_128_internal.h"
+
+int mh_sha1_murmur3_x64_128_init(struct mh_sha1_murmur3_x64_128_ctx *ctx, uint64_t murmur_seed)
+{
+ uint64_t *murmur3_x64_128_hash;
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS];
+ uint32_t i;
+
+ if (ctx == NULL)
+ return MH_SHA1_MURMUR3_CTX_ERROR_NULL;
+
+ memset(ctx, 0, sizeof(*ctx));
+
+ mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests;
+ for (i = 0; i < HASH_SEGS; i++) {
+ mh_sha1_segs_digests[0][i] = MH_SHA1_H0;
+ mh_sha1_segs_digests[1][i] = MH_SHA1_H1;
+ mh_sha1_segs_digests[2][i] = MH_SHA1_H2;
+ mh_sha1_segs_digests[3][i] = MH_SHA1_H3;
+ mh_sha1_segs_digests[4][i] = MH_SHA1_H4;
+ }
+
+ murmur3_x64_128_hash = (uint64_t *) ctx->murmur3_x64_128_digest;
+ murmur3_x64_128_hash[0] = murmur_seed;
+ murmur3_x64_128_hash[1] = murmur_seed;
+
+ return MH_SHA1_MURMUR3_CTX_ERROR_NONE;
+}
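+
+/*
+ * Illustrative usage of this context (a sketch only; murmur_seed, buffer and
+ * buffer_len are caller-supplied values, and the digest arrays assume the
+ * 20-byte SHA1 / 16-byte murmur3_x64_128 output sizes):
+ *
+ *   struct mh_sha1_murmur3_x64_128_ctx ctx;
+ *   uint32_t sha1_digest[SHA1_DIGEST_WORDS];
+ *   uint32_t murmur_digest[MURMUR3_x64_128_DIGEST_WORDS];
+ *
+ *   mh_sha1_murmur3_x64_128_init(&ctx, murmur_seed);
+ *   mh_sha1_murmur3_x64_128_update(&ctx, buffer, buffer_len);
+ *   mh_sha1_murmur3_x64_128_finalize(&ctx, sha1_digest, murmur_digest);
+ */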
+
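+/*
+ * Portable fallback block routine: it simply runs the multi-hash SHA1 base
+ * block function and the murmur3_x64_128 block function back to back over
+ * the same input, whereas the SSE/AVX/AVX2 block routines interleave the
+ * scalar murmur3 work with the vectorized SHA1 rounds.
+ */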
+void mh_sha1_murmur3_x64_128_block_base(const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t
+ murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks)
+{
+
+ mh_sha1_block_base(input_data, mh_sha1_digests, frame_buffer, num_blocks);
+
+ murmur3_x64_128_block(input_data,
+ num_blocks * MH_SHA1_BLOCK_SIZE / MUR_BLOCK_SIZE,
+ murmur3_x64_128_digests);
+
+ return;
+}
+
+#if (!defined(NOARCH)) && (defined(__i386__) || defined(__x86_64__) \
+ || defined( _M_X64) || defined(_M_IX86))
+/***************mh_sha1_murmur3_x64_128_update***********/
+// mh_sha1_murmur3_x64_128_update_sse.c
+#define UPDATE_FUNCTION mh_sha1_murmur3_x64_128_update_sse
+#define BLOCK_FUNCTION mh_sha1_murmur3_x64_128_block_sse
+#include "mh_sha1_murmur3_x64_128_update_base.c"
+#undef UPDATE_FUNCTION
+#undef BLOCK_FUNCTION
+
+// mh_sha1_murmur3_x64_128_update_avx.c
+#define UPDATE_FUNCTION mh_sha1_murmur3_x64_128_update_avx
+#define BLOCK_FUNCTION mh_sha1_murmur3_x64_128_block_avx
+#include "mh_sha1_murmur3_x64_128_update_base.c"
+#undef UPDATE_FUNCTION
+#undef BLOCK_FUNCTION
+
+// mh_sha1_murmur3_x64_128_update_avx2.c
+#define UPDATE_FUNCTION mh_sha1_murmur3_x64_128_update_avx2
+#define BLOCK_FUNCTION mh_sha1_murmur3_x64_128_block_avx2
+#include "mh_sha1_murmur3_x64_128_update_base.c"
+#undef UPDATE_FUNCTION
+#undef BLOCK_FUNCTION
+
+/***************mh_sha1_murmur3_x64_128_finalize***********/
+// mh_sha1_murmur3_x64_128_finalize_sse.c
+#define FINALIZE_FUNCTION mh_sha1_murmur3_x64_128_finalize_sse
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_sse
+#include "mh_sha1_murmur3_x64_128_finalize_base.c"
+#undef FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+
+// mh_sha1_murmur3_x64_128_finalize_avx.c
+#define FINALIZE_FUNCTION mh_sha1_murmur3_x64_128_finalize_avx
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx
+#include "mh_sha1_murmur3_x64_128_finalize_base.c"
+#undef FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+
+// mh_sha1_murmur3_x64_128_finalize_avx2.c
+#define FINALIZE_FUNCTION mh_sha1_murmur3_x64_128_finalize_avx2
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx2
+#include "mh_sha1_murmur3_x64_128_finalize_base.c"
+#undef FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+
+/***************version info***********/
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// Version info
+struct slver mh_sha1_murmur3_x64_128_init_slver_00000251;
+struct slver mh_sha1_murmur3_x64_128_init_slver = { 0x0251, 0x00, 0x00 };
+
+// mh_sha1_murmur3_x64_128_update version info
+struct slver mh_sha1_murmur3_x64_128_update_sse_slver_00000254;
+struct slver mh_sha1_murmur3_x64_128_update_sse_slver = { 0x0254, 0x00, 0x00 };
+
+struct slver mh_sha1_murmur3_x64_128_update_avx_slver_02000256;
+struct slver mh_sha1_murmur3_x64_128_update_avx_slver = { 0x0256, 0x00, 0x02 };
+
+struct slver mh_sha1_murmur3_x64_128_update_avx2_slver_04000258;
+struct slver mh_sha1_murmur3_x64_128_update_avx2_slver = { 0x0258, 0x00, 0x04 };
+
+// mh_sha1_murmur3_x64_128_finalize version info
+struct slver mh_sha1_murmur3_x64_128_finalize_sse_slver_00000255;
+struct slver mh_sha1_murmur3_x64_128_finalize_sse_slver = { 0x0255, 0x00, 0x00 };
+
+struct slver mh_sha1_murmur3_x64_128_finalize_avx_slver_02000257;
+struct slver mh_sha1_murmur3_x64_128_finalize_avx_slver = { 0x0257, 0x00, 0x02 };
+
+struct slver mh_sha1_murmur3_x64_128_finalize_avx2_slver_04000259;
+struct slver mh_sha1_murmur3_x64_128_finalize_avx2_slver = { 0x0259, 0x00, 0x04 };
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_avx512.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_avx512.c
new file mode 100644
index 000000000..fbef1ac13
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_avx512.c
@@ -0,0 +1,67 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha1_murmur3_x64_128_internal.h"
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+/***************mh_sha1_murmur3_x64_128_update***********/
+// mh_sha1_murmur3_x64_128_update_avx512.c
+#define UPDATE_FUNCTION mh_sha1_murmur3_x64_128_update_avx512
+#define BLOCK_FUNCTION mh_sha1_murmur3_x64_128_block_avx512
+#include "mh_sha1_murmur3_x64_128_update_base.c"
+#undef UPDATE_FUNCTION
+#undef BLOCK_FUNCTION
+
+/***************mh_sha1_murmur3_x64_128_finalize***********/
+// mh_sha1_murmur3_x64_128_finalize_avx512.c
+#define FINALIZE_FUNCTION mh_sha1_murmur3_x64_128_finalize_avx512
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx512
+#include "mh_sha1_murmur3_x64_128_finalize_base.c"
+#undef FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+
+/***************version info***********/
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// mh_sha1_murmur3_x64_128_update version info
+struct slver mh_sha1_murmur3_x64_128_update_avx512_slver_0600025c;
+struct slver mh_sha1_murmur3_x64_128_update_avx512_slver = { 0x025c, 0x00, 0x06 };
+
+// mh_sha1_murmur3_x64_128_finalize version info
+struct slver mh_sha1_murmur3_x64_128_finalize_avx512_slver_0600025d;
+struct slver mh_sha1_murmur3_x64_128_finalize_avx512_slver = { 0x025d, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_base_aliases.c
new file mode 100644
index 000000000..28f15086d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_base_aliases.c
@@ -0,0 +1,43 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "mh_sha1_murmur3_x64_128_internal.h"
+#include <string.h>
+int mh_sha1_murmur3_x64_128_update(struct mh_sha1_murmur3_x64_128_ctx *ctx, const void *buffer,
+ uint32_t len)
+{
+ return mh_sha1_murmur3_x64_128_update_base(ctx, buffer, len);
+
+}
+
+int mh_sha1_murmur3_x64_128_finalize(struct mh_sha1_murmur3_x64_128_ctx *ctx,
+ void *mh_sha1_digest, void *murmur3_x64_128_digest)
+{
+ return mh_sha1_murmur3_x64_128_finalize_base(ctx, mh_sha1_digest,
+ murmur3_x64_128_digest);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx.asm
new file mode 100644
index 000000000..4611494e0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx.asm
@@ -0,0 +1,706 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA1 digests interleaved with murmur3_x64_128 using AVX
+;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF, %%regC,%%regD
+ vpand %%regF, %%regF,%%regB
+ vpxor %%regF, %%regF,%%regD
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF,%%regD,%%regC
+ vpxor %%regF,%%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpor %%regF,%%regB,%%regC
+ vpand %%regT,%%regB,%%regC
+ vpand %%regF,%%regF,%%regD
+ vpor %%regF,%%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-(%%imm))
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PROLD_nd reg, imm, tmp, src
+%macro PROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsrld %%tmp, %%src, (32-(%%imm))
+ vpslld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+ vpaddd %%regE, %%regE,[%%data + (%%memW * 16)]
+ PROLD_nd %%regT,5, %%regF,%%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE, %%regE,%%regF
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro SHA1_STEP_16_79 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+;; Murmur3 instructions are interleaved into this macro.
+;; Every section_loop of mh_sha1 calls SHA1_STEP_16_79 64 times and processes 256 bytes,
+;; so one 16-byte murmur block is folded into every 4 SHA1_STEP_16_79 steps.
+%define SHA1_STEP_16_79(J) SHA1_STEP_16_79_ %+ J
+
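+;; As an illustration only (not assembled), the scalar murmur3 work spread
+;; across the four SHA1_STEP_16_79_0..3 variants below corresponds to this
+;; per-16-byte C sketch, using the R1-R4, M, N1/N2 and C1/C2 constants defined
+;; later in this file (rotl64, load64, k1, k2, h1, h2, p are illustrative names):
+;;
+;;   k1 = load64(p);  k2 = load64(p + 8);       // _0: loads and k *= C1/C2
+;;   k1 *= C1;        k2 *= C2;
+;;   k1 = rotl64(k1, R1) * C2;                  // _1: rotates, second multiplies,
+;;   k2 = rotl64(k2, R2) * C1;  p += 16;        //     and input-pointer advance
+;;   h1 ^= k1;  h1 = rotl64(h1, R3) + h2;       // _2: fold k1 into h1
+;;   h1 = h1 * M + N1;
+;;   h2 ^= k2;  h2 = rotl64(h2, R4) + h1;       // _3: fold k2 into h2
+;;   h2 = h2 * M + N2;
+;;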
+%macro SHA1_STEP_16_79_0 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16]
+
+ vpsrld %%regF, W16, (32-1)
+ mov mur_data1, [mur_in_p]
+ mov mur_data2, [mur_in_p + 8]
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ imul mur_data1, mur_c1_r
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ imul mur_data2, mur_c2_r
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79_1 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+ rol mur_data1, R1
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ rol mur_data2, R2
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ imul mur_data1, mur_c2_r
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ imul mur_data2, mur_c1_r
+ PROLD %%regB,30, %%regT
+ add mur_in_p, 16
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79_2 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ xor mur_hash1, mur_data1
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16]
+ rol mur_hash1, R3
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ add mur_hash1, mur_hash2
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ lea mur_hash1, [mur_hash1 + mur_hash1*4 + N1]
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79_3 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ xor mur_hash2, mur_data2
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16]
+ rol mur_hash2, R4
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ add mur_hash2, mur_hash1
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ lea mur_hash2, [mur_hash2 + mur_hash2*4 + N2]
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8d
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ push rbx
+ push rbp
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop rbp
+ pop rbx
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10d
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 9*8 ; must be an odd multiple of 8
+ %define PS 8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ save_reg rbx, 10*16 + 6*8
+ save_reg rbp, 10*16 + 7*8
+ end_prolog
+ mov arg4, arg(4)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ mov rbx, [rsp + 10*16 + 6*8]
+ mov rbp, [rsp + 10*16 + 7*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg4
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables of murmur3
+%define mur_in_p tmp2
+%define mur_digest_p arg3
+%define mur_hash1 tmp3
+%define mur_hash2 tmp4
+%define mur_data1 tmp5
+%define mur_data2 return
+%define mur_c1_r tmp6
+%define mur_c2_r arg5
+; constants of murmur3_x64_128
+%define R1 31
+%define R2 33
+%define R3 27
+%define R4 31
+%define M 5
+%define N1 0x52dce729;DWORD
+%define N2 0x38495ab5;DWORD
+%define C1 QWORD(0x87c37b91114253d5)
+%define C2 QWORD(0x4cf5ad432745937f)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp7
+%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovups
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4
+%define F xmm5 ; tmp
+%define G xmm6 ; tmp
+
+%define TMP G
+%define FUN F
+%define K xmm7
+
+%define AA xmm8
+%define BB xmm9
+%define CC xmm10
+%define DD xmm11
+%define EE xmm12
+
+%define T0 xmm6
+%define T1 xmm7
+%define T2 xmm8
+%define T3 xmm9
+%define T4 xmm10
+%define T5 xmm11
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define W14 xmm13
+%define W15 xmm14
+%define W16 xmm15
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h4 | h4 | h4 | ...| h4 |    | Ea| Eb | Ec |...| Ep |
+
+align 32
+;void mh_sha1_murmur3_x64_128_block_avx (const uint8_t * input_data,
+; uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+; uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+; uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including the segment digests (uint32_t digests[16][5])
+; arg 2 pointer to aligned_frame_buffer, which is used to save the big-endian data.
+; arg 3 pointer to murmur3 digest
+; arg 4 number of 1KB blocks
+;
+mk_global mh_sha1_murmur3_x64_128_block_avx, function, internal
+func(mh_sha1_murmur3_x64_128_block_avx)
+ endbranch
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+	; align rsp to 16 bytes, as required by AVX
+ and rsp, ~0x0F
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 5
+ VMOVPS A, [mh_digests_p + I*64 + 16*0]
+ VMOVPS B, [mh_digests_p + I*64 + 16*1]
+ VMOVPS C, [mh_digests_p + I*64 + 16*2]
+ VMOVPS D, [mh_digests_p + I*64 + 16*3]
+
+ vmovdqa [rsp + I*64 + 16*0], A
+ vmovdqa [rsp + I*64 + 16*1], B
+ vmovdqa [rsp + I*64 + 16*2], C
+ vmovdqa [rsp + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+
+ ;init murmur variables
+	mov	mur_in_p, mh_in_p	;murmur and mh_sha1 advance through the input at different strides
+ ;load murmur hash digests and multiplier
+ mov mur_hash1, [mur_digest_p]
+ mov mur_hash2, [mur_digest_p + 8]
+ mov mur_c1_r, C1
+ mov mur_c2_r, C2
+
+.block_loop:
+	;transform to big-endian data and store it in the aligned frame buffer
+ vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*5 to DWORD*4_SEGS*5*4
+ %assign I 0
+ %rep 16
+ VMOVPS T0,[mh_in_p + I*64+0*16]
+ VMOVPS T1,[mh_in_p + I*64+1*16]
+ VMOVPS T2,[mh_in_p + I*64+2*16]
+ VMOVPS T3,[mh_in_p + I*64+3*16]
+
+ vpshufb T0, F
+ vmovdqa [mh_data_p +(I)*16 +0*256],T0
+ vpshufb T1, F
+ vmovdqa [mh_data_p +(I)*16 +1*256],T1
+ vpshufb T2, F
+ vmovdqa [mh_data_p +(I)*16 +2*256],T2
+ vpshufb T3, F
+ vmovdqa [mh_data_p +(I)*16 +3*256],T3
+ %assign I (I+1)
+ %endrep
+
+ mov mh_segs, 0 ;start from the first 4 segments
+ .segs_loop:
+ ;; Initialize digests
+ vmovdqa A, [rsp + 0*64 + mh_segs]
+ vmovdqa B, [rsp + 1*64 + mh_segs]
+ vmovdqa C, [rsp + 2*64 + mh_segs]
+ vmovdqa D, [rsp + 3*64 + mh_segs]
+ vmovdqa E, [rsp + 4*64 + mh_segs]
+
+ vmovdqa AA, A
+ vmovdqa BB, B
+ vmovdqa CC, C
+ vmovdqa DD, D
+ vmovdqa EE, E
+;;
+;; perform 0-79 steps
+;;
+ vmovdqa K, [K00_19]
+;; do rounds 0...15
+ %assign I 0
+ %rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 16...19
+ vmovdqa W16, [mh_data_p + ((16 - 16) & 15) * 16]
+ vmovdqa W15, [mh_data_p + ((16 - 15) & 15) * 16]
+ %rep 4
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 20...39
+ vmovdqa K, [K20_39]
+ %rep 20
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 40...59
+ vmovdqa K, [K40_59]
+ %rep 20
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 60...79
+ vmovdqa K, [K60_79]
+ %rep 20
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+ vpaddd A, AA
+ vpaddd B, BB
+ vpaddd C, CC
+ vpaddd D, DD
+ vpaddd E, EE
+
+ ; write out digests
+ vmovdqa [rsp + 0*64 + mh_segs], A
+ vmovdqa [rsp + 1*64 + mh_segs], B
+ vmovdqa [rsp + 2*64 + mh_segs], C
+ vmovdqa [rsp + 3*64 + mh_segs], D
+ vmovdqa [rsp + 4*64 + mh_segs], E
+
+ add mh_data_p, 256
+ add mh_segs, 16
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+ ;store murmur-hash digest
+ mov [mur_digest_p], mur_hash1
+ mov [mur_digest_p + 8], mur_hash2
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 5
+ vmovdqa A, [rsp + I*64 + 16*0]
+ vmovdqa B, [rsp + I*64 + 16*1]
+ vmovdqa C, [rsp + I*64 + 16*2]
+ vmovdqa D, [rsp + I*64 + 16*3]
+
+ VMOVPS [mh_digests_p + I*64 + 16*0], A
+ VMOVPS [mh_digests_p + I*64 + 16*1], B
+ VMOVPS [mh_digests_p + I*64 + 16*2], C
+ VMOVPS [mh_digests_p + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=16
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx2.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx2.asm
new file mode 100644
index 000000000..3fb440bf1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx2.asm
@@ -0,0 +1,653 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16-segment mh_sha1 stitched with murmur3_x64_128 using AVX2
+;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; Magic functions defined in FIPS 180-1
+;;
+;MAGIC_F0 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF, %%regC,%%regD
+ vpand %%regF, %%regF,%%regB
+ vpxor %%regF, %%regF,%%regD
+%endmacro
+
+;MAGIC_F1 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF,%%regD,%%regC
+ vpxor %%regF,%%regF,%%regB
+%endmacro
+
+
+
+;MAGIC_F2 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpor %%regF,%%regB,%%regC
+ vpand %%regT,%%regB,%%regC
+ vpand %%regF,%%regF,%%regD
+ vpor %%regF,%%regF,%%regT
+%endmacro
+
+;MAGIC_F3 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-%%imm)
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PROLD_nd reg, imm, tmp, src
+%macro PROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsrld %%tmp, %%src, (32-%%imm)
+ vpslld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+ vpaddd %%regE, %%regE,[%%data + (%%memW * 32)]
+ PROLD_nd %%regT,5, %%regF,%%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE, %%regE,%%regF
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro SHA1_STEP_16_79 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 32]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 32]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 32]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 32],%%regF
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+;; Insert murmur3's scalar instructions into these macros.
+;; Every segs_loop of mh_sha1 calls SHA1_STEP_16_79 64 times and processes 512 bytes,
+;; so one 16-byte murmur3 block is interleaved into every 2 SHA1_STEP_16_79 calls
+;; (a reference C sketch of that murmur3 block follows the dispatch define below).
+%define SHA1_STEP_16_79(J) SHA1_STEP_16_79_ %+ J
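+;;
+;; Reference only (not upstream code): a minimal C sketch of the single
+;; murmur3_x64_128 16-byte block round whose scalar instructions are split
+;; across SHA1_STEP_16_79_0/_1, written with this file's R1-R4, N1/N2, C1/C2
+;; (load64/rotl64 are placeholder helpers):
+;;
+;;   k1 = load64(p);  k2 = load64(p + 8);            /* mur_data1, mur_data2 */
+;;   k1 *= C1;  k1 = rotl64(k1, R1);  k1 *= C2;
+;;   k2 *= C2;  k2 = rotl64(k2, R2);  k2 *= C1;
+;;   h1 ^= k1;  h1 = rotl64(h1, R3);  h1 += h2;      /* mur_hash1 */
+;;   h1 = h1 * 5 + N1;
+;;   h2 ^= k2;  h2 = rotl64(h2, R4);  h2 += h1;      /* mur_hash2 */
+;;   h2 = h2 * 5 + N2;
+;;   p += 16;                                        /* mur_in_p */
+;;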
+
+%macro SHA1_STEP_16_79_0 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 32]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 32]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 32]
+ mov mur_data1, [mur_in_p]
+ mov mur_data2, [mur_in_p + 8]
+
+ vpsrld %%regF, W16, (32-1)
+ imul mur_data1, mur_c1_r
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ imul mur_data2, mur_c2_r
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 32],%%regF
+ rol mur_data1, R1
+ vpaddd %%regE, %%regE,%%regF
+ rol mur_data2, R2
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ imul mur_data1, mur_c2_r
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ imul mur_data2, mur_c1_r
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+
+%macro SHA1_STEP_16_79_1 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+ xor mur_hash1, mur_data1
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 32]
+ rol mur_hash1, R3
+ vpxor W16, W16, W14
+ add mur_hash1, mur_hash2
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 32]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 32]
+ lea mur_hash1, [mur_hash1 + mur_hash1*4 + N1]
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ xor mur_hash2, mur_data2
+ vpor %%regF, %%regF, W16
+ rol mur_hash2, R4
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 32],%%regF
+ vpaddd %%regE, %%regE,%%regF
+ add mur_hash2, mur_hash1
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ lea mur_hash2, [mur_hash2 + mur_hash2*4 + N2]
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ add mur_in_p, 16
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8d
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ push rbx
+ push rbp
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop rbp
+ pop rbx
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10d
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 9*8 ; must be an odd multiple of 8
+ %define PS 8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ save_reg rbx, 10*16 + 6*8
+ save_reg rbp, 10*16 + 7*8
+ end_prolog
+ mov arg4, arg(4)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ mov rbx, [rsp + 10*16 + 6*8]
+ mov rbp, [rsp + 10*16 + 7*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg4
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables of murmur3
+%define mur_in_p tmp2
+%define mur_digest_p arg3
+%define mur_hash1 tmp3
+%define mur_hash2 tmp4
+%define mur_data1 tmp5
+%define mur_data2 return
+%define mur_c1_r tmp6
+%define mur_c2_r arg5
+; constants of murmur3_x64_128
+%define R1 31
+%define R2 33
+%define R3 27
+%define R4 31
+%define M 5
+%define N1 0x52dce729;DWORD
+%define N2 0x38495ab5;DWORD
+%define C1 QWORD(0x87c37b91114253d5)
+%define C2 QWORD(0x4cf5ad432745937f)
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp7
+%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS
+
+%define pref tmp8
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovups
+
+%define A ymm0
+%define B ymm1
+%define C ymm2
+%define D ymm3
+%define E ymm4
+
+%define F ymm5
+%define T0 ymm6
+%define T1 ymm7
+%define T2 ymm8
+%define T3 ymm9
+%define T4 ymm10
+%define T5 ymm11
+%define T6 ymm12
+%define T7 ymm13
+%define T8 ymm14
+%define T9 ymm15
+
+%define AA ymm5
+%define BB ymm6
+%define CC ymm7
+%define DD ymm8
+%define EE ymm9
+%define TMP ymm10
+%define FUN ymm11
+%define K ymm12
+%define W14 ymm13
+%define W15 ymm14
+%define W16 ymm15
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h4 | h4 | h4 | ...| h4 |    | Ea| Eb | Ec |...| Ep |
+
+align 32
+;void mh_sha1_murmur3_x64_128_block_avx2 (const uint8_t * input_data,
+; uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+; uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+; uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, include segments digests(uint32_t digests[16][5])
+; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data.
+; arg 3 pointer to murmur3 digest
+; arg 4 number of 1KB blocks
+;
+mk_global mh_sha1_murmur3_x64_128_block_avx2, function, internal
+func(mh_sha1_murmur3_x64_128_block_avx2)
+ endbranch
+ FUNC_SAVE
+
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 32 Bytes needed by avx2
+ and rsp, ~0x1F
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 2
+ VMOVPS A, [mh_digests_p + I*32*5 + 32*0]
+ VMOVPS B, [mh_digests_p + I*32*5 + 32*1]
+ VMOVPS C, [mh_digests_p + I*32*5 + 32*2]
+ VMOVPS D, [mh_digests_p + I*32*5 + 32*3]
+ VMOVPS E, [mh_digests_p + I*32*5 + 32*4]
+
+ vmovdqa [rsp + I*32*5 + 32*0], A
+ vmovdqa [rsp + I*32*5 + 32*1], B
+ vmovdqa [rsp + I*32*5 + 32*2], C
+ vmovdqa [rsp + I*32*5 + 32*3], D
+ vmovdqa [rsp + I*32*5 + 32*4], E
+ %assign I (I+1)
+ %endrep
+
+ ;init murmur variables
+	mov	mur_in_p, mh_in_p	;murmur and mh_sha1 walk the input at different strides, so keep a separate pointer
+ ;load murmur hash digests and multiplier
+ mov mur_hash1, [mur_digest_p]
+ mov mur_hash2, [mur_digest_p + 8]
+ mov mur_c1_r, C1
+ mov mur_c2_r, C2
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*5 to DWORD*8_SEGS*5*2
+%assign I 0
+%rep 16
+ VMOVPS T0,[mh_in_p + I*64+0*32]
+ VMOVPS T1,[mh_in_p + I*64+1*32]
+
+ vpshufb T0, T0, F
+ vmovdqa [mh_data_p +I*32+0*512],T0
+ vpshufb T1, T1, F
+ vmovdqa [mh_data_p +I*32+1*512],T1
+%assign I (I+1)
+%endrep
+
+ mov mh_segs, 0 ;start from the first 8 segments
+	mov	pref, 1024	;avoid prefetching the same lines repeatedly
+ .segs_loop:
+ ;; Initialize digests
+ vmovdqa A, [rsp + 0*64 + mh_segs]
+ vmovdqa B, [rsp + 1*64 + mh_segs]
+ vmovdqa C, [rsp + 2*64 + mh_segs]
+ vmovdqa D, [rsp + 3*64 + mh_segs]
+ vmovdqa E, [rsp + 4*64 + mh_segs]
+
+ vmovdqa AA, A
+ vmovdqa BB, B
+ vmovdqa CC, C
+ vmovdqa DD, D
+ vmovdqa EE, E
+;;
+;; perform 0-79 steps
+;;
+ vmovdqa K, [K00_19]
+;; do rounds 0...15
+ %assign I 0
+ %rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19
+ vmovdqa W16, [mh_data_p + ((16 - 16) & 15) * 32]
+ vmovdqa W15, [mh_data_p + ((16 - 15) & 15) * 32]
+ %rep 4
+ %assign J (I % 2)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*0]
+ PREFETCH_X [mh_in_p + pref+128*1]
+;; do rounds 20...39
+ vmovdqa K, [K20_39]
+ %rep 20
+ %assign J (I % 2)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+;; do rounds 40...59
+ vmovdqa K, [K40_59]
+ %rep 20
+ %assign J (I % 2)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*2]
+ PREFETCH_X [mh_in_p + pref+128*3]
+;; do rounds 60...79
+ vmovdqa K, [K60_79]
+ %rep 20
+ %assign J (I % 2)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+ vpaddd A,A, AA
+ vpaddd B,B, BB
+ vpaddd C,C, CC
+ vpaddd D,D, DD
+ vpaddd E,E, EE
+
+ ; write out digests
+ vmovdqa [rsp + 0*64 + mh_segs], A
+ vmovdqa [rsp + 1*64 + mh_segs], B
+ vmovdqa [rsp + 2*64 + mh_segs], C
+ vmovdqa [rsp + 3*64 + mh_segs], D
+ vmovdqa [rsp + 4*64 + mh_segs], E
+
+ add pref, 512
+
+ add mh_data_p, 512
+ add mh_segs, 32
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+ ;store murmur-hash digest
+ mov [mur_digest_p], mur_hash1
+ mov [mur_digest_p + 8], mur_hash2
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 2
+ vmovdqa A, [rsp + I*32*5 + 32*0]
+ vmovdqa B, [rsp + I*32*5 + 32*1]
+ vmovdqa C, [rsp + I*32*5 + 32*2]
+ vmovdqa D, [rsp + I*32*5 + 32*3]
+ vmovdqa E, [rsp + I*32*5 + 32*4]
+
+ VMOVPS [mh_digests_p + I*32*5 + 32*0], A
+ VMOVPS [mh_digests_p + I*32*5 + 32*1], B
+ VMOVPS [mh_digests_p + I*32*5 + 32*2], C
+ VMOVPS [mh_digests_p + I*32*5 + 32*3], D
+ VMOVPS [mh_digests_p + I*32*5 + 32*4], E
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=32
+
+align 32
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx512.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx512.asm
new file mode 100644
index 000000000..a5c157078
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx512.asm
@@ -0,0 +1,504 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16-segment mh_sha1 stitched with murmur3_x64_128 using AVX-512
+;;
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovdqu64
+;SIMD variables definition
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define HH0 zmm5
+%define HH1 zmm6
+%define HH2 zmm7
+%define HH3 zmm8
+%define HH4 zmm9
+%define KT zmm10
+%define XTMP0 zmm11
+%define XTMP1 zmm12
+%define SHUF_MASK zmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;using extra 16 ZMM registers to place the inverse input data
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;macros definition
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%F_IMMED %2
+
+ ; T = ROTL_5(A) + Ft(B,C,D) + E + Kt + Wt
+ ; E=D, D=C, C=ROTL_30(B), B=A, A=T
+
+ ; Ft
+ ; 0-19 Ch(B,C,D) = (B&C) ^ (~B&D)
+ ; 20-39, 60-79 Parity(B,C,D) = B ^ C ^ D
+ ; 40-59 Maj(B,C,D) = (B&C) ^ (B&D) ^ (C&D)
+
+ vmovdqa32 XTMP1, B ; Copy B
+ vpaddd E, E, %%WT ; E = E + Wt
+ vpternlogd XTMP1, C, D, %%F_IMMED ; TMP1 = Ft(B,C,D)
+ vpaddd E, E, KT ; E = E + Wt + Kt
+ vprold XTMP0, A, 5 ; TMP0 = ROTL_5(A)
+ vpaddd E, E, XTMP1 ; E = Ft(B,C,D) + E + Kt + Wt
+ vprold B, B, 30 ; B = ROTL_30(B)
+ vpaddd E, E, XTMP0 ; E = T
+
+ ROTATE_ARGS
+%endmacro
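+
+;; Reference only (not upstream code): the vpternlogd immediates used in this
+;; file are the 8-bit truth tables of Ft(B,C,D) with bit index (B<<2)|(C<<1)|D,
+;; which gives Ch -> 0xCA, Parity -> 0x96 and Maj -> 0xE8. A minimal C sketch
+;; that reproduces these constants:
+;;
+;;   uint8_t ternlog_imm(int (*f)(int, int, int)) {
+;;       uint8_t imm = 0;
+;;       for (int i = 0; i < 8; i++)                 /* i = (B<<2)|(C<<1)|D */
+;;           if (f((i >> 2) & 1, (i >> 1) & 1, i & 1))
+;;               imm |= (uint8_t)(1u << i);
+;;       return imm;   /* Ch -> 0xCA, Parity -> 0x96, Maj -> 0xE8 */
+;;   }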
+
+;; Insert murmur3's scalar instructions into these macros.
+;; Every block_loop of mh_sha1 runs PROCESS_LOOP(_MUR) 80 times and
+;; MSG_SCHED_ROUND_16_79_MUR 64 times and processes 1024 bytes,
+;; so one 16-byte murmur3 block is interleaved into each of the 64 stitched
+;; rounds (64 murmur3 blocks per block_loop).
+%macro PROCESS_LOOP_MUR 2
+%define %%WT %1
+%define %%F_IMMED %2
+
+ ; T = ROTL_5(A) + Ft(B,C,D) + E + Kt + Wt
+ ; E=D, D=C, C=ROTL_30(B), B=A, A=T
+
+ ; Ft
+ ; 0-19 Ch(B,C,D) = (B&C) ^ (~B&D)
+ ; 20-39, 60-79 Parity(B,C,D) = B ^ C ^ D
+ ; 40-59 Maj(B,C,D) = (B&C) ^ (B&D) ^ (C&D)
+
+ mov mur_data1, [mur_in_p]
+ mov mur_data2, [mur_in_p + 8]
+ vmovdqa32 XTMP1, B ; Copy B
+ imul mur_data1, mur_c1_r
+ imul mur_data2, mur_c2_r
+ vpaddd E, E, %%WT ; E = E + Wt
+ rol mur_data1, R1
+ rol mur_data2, R2
+ vpternlogd XTMP1, C, D, %%F_IMMED ; TMP1 = Ft(B,C,D)
+ imul mur_data1, mur_c2_r
+ imul mur_data2, mur_c1_r
+ vpaddd E, E, KT ; E = E + Wt + Kt
+ xor mur_hash1, mur_data1
+ add mur_in_p, 16
+ vprold XTMP0, A, 5 ; TMP0 = ROTL_5(A)
+ rol mur_hash1, R3
+ vpaddd E, E, XTMP1 ; E = Ft(B,C,D) + E + Kt + Wt
+ add mur_hash1, mur_hash2
+ vprold B, B, 30 ; B = ROTL_30(B)
+ lea mur_hash1, [mur_hash1 + mur_hash1*4 + N1]
+ vpaddd E, E, XTMP0 ; E = T
+ xor mur_hash2, mur_data2
+
+ ROTATE_ARGS
+%endmacro
+
+%macro MSG_SCHED_ROUND_16_79_MUR 4
+%define %%WT %1
+%define %%WTp2 %2
+%define %%WTp8 %3
+%define %%WTp13 %4
+ ; Wt = ROTL_1(Wt-3 ^ Wt-8 ^ Wt-14 ^ Wt-16)
+ ; Wt+16 = ROTL_1(Wt+13 ^ Wt+8 ^ Wt+2 ^ Wt)
+ vpternlogd %%WT, %%WTp2, %%WTp8, 0x96
+ rol mur_hash2, R4
+ vpxord %%WT, %%WT, %%WTp13
+ add mur_hash2, mur_hash1
+ lea mur_hash2, [mur_hash2 + mur_hash2*4 + N2]
+ vprold %%WT, %%WT, 1
+%endmacro
+
+%define APPEND(a,b) a %+ b
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8d
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ push rbx
+ push rbp
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop rbp
+ pop rbx
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10d
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 9*8 ; must be an odd multiple of 8
+ %define PS 8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ ; remove unwind info macros
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ sub rsp, stack_size
+ movdqa [rsp + 0*16], xmm6
+ movdqa [rsp + 1*16], xmm7
+ movdqa [rsp + 2*16], xmm8
+ movdqa [rsp + 3*16], xmm9
+ movdqa [rsp + 4*16], xmm10
+ movdqa [rsp + 5*16], xmm11
+ movdqa [rsp + 6*16], xmm12
+ movdqa [rsp + 7*16], xmm13
+ movdqa [rsp + 8*16], xmm14
+ movdqa [rsp + 9*16], xmm15
+ mov [rsp + 10*16 + 0*8], r12
+ mov [rsp + 10*16 + 1*8], r13
+ mov [rsp + 10*16 + 2*8], r14
+ mov [rsp + 10*16 + 3*8], r15
+ mov [rsp + 10*16 + 4*8], rdi
+ mov [rsp + 10*16 + 5*8], rsi
+ mov [rsp + 10*16 + 6*8], rbx
+ mov [rsp + 10*16 + 7*8], rbp
+ mov arg4, arg(4)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ mov rbx, [rsp + 10*16 + 6*8]
+ mov rbp, [rsp + 10*16 + 7*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg4
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables of murmur3
+%define mur_in_p tmp2
+%define mur_digest_p arg3
+%define mur_hash1 tmp3
+%define mur_hash2 tmp4
+%define mur_data1 tmp5
+%define mur_data2 return
+%define mur_c1_r tmp6
+%define mur_c2_r arg5
+; constants of murmur3_x64_128
+%define R1 31
+%define R2 33
+%define R3 27
+%define R4 31
+%define M 5
+%define N1 0x52dce729;DWORD
+%define N2 0x38495ab5;DWORD
+%define C1 QWORD(0x87c37b91114253d5)
+%define C2 QWORD(0x4cf5ad432745937f)
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp7
+
+%define pref tmp8
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h4 | h4 | h4 | ...| h4 |    | Ea| Eb | Ec |...| Ep |
+
+[bits 64]
+section .text
+align 32
+
+;void mh_sha1_murmur3_x64_128_block_avx512 (const uint8_t * input_data,
+; uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+; uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+; uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, include segments digests(uint32_t digests[16][5])
+; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data.
+; arg 3 pointer to murmur3 digest
+; arg 4 number of 1KB blocks
+;
+global mh_sha1_murmur3_x64_128_block_avx512
+func(mh_sha1_murmur3_x64_128_block_avx512)
+ endbranch
+ FUNC_SAVE
+
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; align rsp to 64 Bytes needed by avx512
+ and rsp, ~0x3f
+
+ ; copy segs_digests into registers.
+ VMOVPS HH0, [mh_digests_p + 64*0]
+ VMOVPS HH1, [mh_digests_p + 64*1]
+ VMOVPS HH2, [mh_digests_p + 64*2]
+ VMOVPS HH3, [mh_digests_p + 64*3]
+ VMOVPS HH4, [mh_digests_p + 64*4]
+ ;a mask used to transform to big-endian data
+ vmovdqa64 SHUF_MASK, [PSHUFFLE_BYTE_FLIP_MASK]
+
+ ;init murmur variables
+	mov	mur_in_p, mh_in_p	;murmur and mh_sha1 walk the input at different strides, so keep a separate pointer
+ ;load murmur hash digests and multiplier
+ mov mur_hash1, [mur_digest_p]
+ mov mur_hash2, [mur_digest_p + 8]
+ mov mur_c1_r, C1
+ mov mur_c2_r, C2
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ ;using extra 16 ZMM registers instead of stack
+%assign I 0
+%rep 8
+%assign J (I+1)
+ VMOVPS APPEND(W,I),[mh_in_p + I*64+0*64]
+ VMOVPS APPEND(W,J),[mh_in_p + I*64+1*64]
+
+ vpshufb APPEND(W,I), APPEND(W,I), SHUF_MASK
+ vpshufb APPEND(W,J), APPEND(W,J), SHUF_MASK
+%assign I (I+2)
+%endrep
+
+ vmovdqa64 A, HH0
+ vmovdqa64 B, HH1
+ vmovdqa64 C, HH2
+ vmovdqa64 D, HH3
+ vmovdqa64 E, HH4
+
+ vmovdqa32 KT, [K00_19]
+%assign I 0xCA
+%assign J 0
+%assign K 2
+%assign L 8
+%assign M 13
+%assign N 0
+%rep 80
+ %if N < 64 ; stitching 64 times
+ PROCESS_LOOP_MUR APPEND(W,J), I
+ MSG_SCHED_ROUND_16_79_MUR APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+ %else ; 64 <= N < 80, without stitching
+ PROCESS_LOOP APPEND(W,J), I
+ %endif
+ %if N = 19
+ vmovdqa32 KT, [K20_39]
+ %assign I 0x96
+ %elif N = 39
+ vmovdqa32 KT, [K40_59]
+ %assign I 0xE8
+ %elif N = 59
+ vmovdqa32 KT, [K60_79]
+ %assign I 0x96
+ %endif
+ %if N % 20 = 19
+ PREFETCH_X [mh_in_p + 1024+128*(N / 20)]
+ PREFETCH_X [mh_in_p + 1024+128*(N / 20 +1)]
+ %endif
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%assign N (N+1)
+%endrep
+
+ ; Add old digest
+ vpaddd HH0,A, HH0
+ vpaddd HH1,B, HH1
+ vpaddd HH2,C, HH2
+ vpaddd HH3,D, HH3
+ vpaddd HH4,E, HH4
+
+ add mh_in_p, 1024
+ sub loops, 1
+ jne .block_loop
+
+ ;store murmur-hash digest
+ mov [mur_digest_p], mur_hash1
+ mov [mur_digest_p + 8], mur_hash2
+
+ ; copy segs_digests to mh_digests_p
+ VMOVPS [mh_digests_p + 64*0], HH0
+ VMOVPS [mh_digests_p + 64*1], HH1
+ VMOVPS [mh_digests_p + 64*2], HH2
+ VMOVPS [mh_digests_p + 64*3], HH3
+ VMOVPS [mh_digests_p + 64*4], HH4
+
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+
+section .data align=64
+
+align 64
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+
+K00_19: dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+
+K20_39: dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+
+K40_59: dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+
+K60_79: dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_murmur3_x64_128_block_avx512
+no_sha1_murmur3_x64_128_block_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_sse.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_sse.asm
new file mode 100644
index 000000000..ebd1b8b49
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_sse.asm
@@ -0,0 +1,702 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16-segment mh_sha1 stitched with murmur3_x64_128 using SSE
+;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regC
+ pxor %%regF,%%regD
+ pand %%regF,%%regB
+ pxor %%regF,%%regD
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regD
+ pxor %%regF,%%regC
+ pxor %%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regB
+ movdqa %%regT,%%regB
+ por %%regF,%%regC
+ pand %%regT,%%regC
+ pand %%regF,%%regD
+ por %%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ pslld %%reg, %%imm
+ psrld %%tmp, (32-%%imm)
+ por %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ paddd %%regE,%%immCNT
+ paddd %%regE,[%%data + (%%memW * 16)]
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro SHA1_STEP_16_79 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ paddd %%regE,%%immCNT
+ movdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ pxor W16, W14
+ pxor W16, [%%data + ((%%memW - 8) & 15) * 16]
+ pxor W16, [%%data + ((%%memW - 3) & 15) * 16]
+ movdqa %%regF, W16
+ pslld W16, 1
+ psrld %%regF, (32-1)
+ por %%regF, W16
+ ROTATE_W
+
+ movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ paddd %%regE,%%regF
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+;; Insert murmur3's scalar instructions into these macros.
+;; Every segs_loop of mh_sha1 calls SHA1_STEP_16_79 64 times and processes 256 bytes,
+;; so one 16-byte murmur3 block is interleaved into every 4 SHA1_STEP_16_79 calls
+;; (the split across the four macros is summarized after the dispatch define below).
+%define SHA1_STEP_16_79(J) SHA1_STEP_16_79_ %+ J
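+;;
+;; Reference summary (not upstream text) of how the murmur3 round is split:
+;;   _0 loads mur_data1/mur_data2 and multiplies them by C1/C2,
+;;   _1 rotates them by R1/R2, multiplies by C2/C1 and advances mur_in_p,
+;;   _2 folds mur_data1 into mur_hash1 (xor, rol R3, add mur_hash2, *5 + N1),
+;;   _3 folds mur_data2 into mur_hash2 (xor, rol R4, add mur_hash1, *5 + N2).
+;;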
+
+%macro SHA1_STEP_16_79_0 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+
+ paddd %%regE,%%immCNT
+ movdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ pxor W16, W14
+ pxor W16, [%%data + ((%%memW - 8) & 15) * 16]
+ pxor W16, [%%data + ((%%memW - 3) & 15) * 16]
+ movdqa %%regF, W16
+ mov mur_data1, [mur_in_p]
+ mov mur_data2, [mur_in_p + 8]
+ pslld W16, 1
+ psrld %%regF, (32-1)
+ por %%regF, W16
+
+ ROTATE_W
+
+ movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ imul mur_data1, mur_c1_r
+ paddd %%regE,%%regF
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ imul mur_data2, mur_c2_r
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79_1 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ paddd %%regE,%%immCNT
+ rol mur_data1, R1
+ movdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ pxor W16, W14
+ pxor W16, [%%data + ((%%memW - 8) & 15) * 16]
+ pxor W16, [%%data + ((%%memW - 3) & 15) * 16]
+ movdqa %%regF, W16
+ pslld W16, 1
+ rol mur_data2, R2
+ psrld %%regF, (32-1)
+ por %%regF, W16
+
+ ROTATE_W
+
+ movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ imul mur_data1, mur_c2_r
+ paddd %%regE,%%regF
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ imul mur_data2, mur_c1_r
+ PROLD %%regB,30, %%regT
+ add mur_in_p, 16
+ paddd %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79_2 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ paddd %%regE,%%immCNT
+ movdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ xor mur_hash1, mur_data1
+ pxor W16, W14
+ pxor W16, [%%data + ((%%memW - 8) & 15) * 16]
+ pxor W16, [%%data + ((%%memW - 3) & 15) * 16]
+ rol mur_hash1, R3
+ movdqa %%regF, W16
+ pslld W16, 1
+ psrld %%regF, (32-1)
+ por %%regF, W16
+
+ ROTATE_W
+
+ movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ add mur_hash1, mur_hash2
+ paddd %%regE,%%regF
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ lea mur_hash1, [mur_hash1 + mur_hash1*4 + N1]
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79_3 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ paddd %%regE,%%immCNT
+ movdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ xor mur_hash2, mur_data2
+ pxor W16, W14
+ pxor W16, [%%data + ((%%memW - 8) & 15) * 16]
+ pxor W16, [%%data + ((%%memW - 3) & 15) * 16]
+ rol mur_hash2, R4
+ movdqa %%regF, W16
+ pslld W16, 1
+ psrld %%regF, (32-1)
+ por %%regF, W16
+
+ ROTATE_W
+
+ movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ add mur_hash2, mur_hash1
+ paddd %%regE,%%regF
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ lea mur_hash2, [mur_hash2 + mur_hash2*4 + N2]
+ paddd %%regE,%%regF
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8d
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ push rbx
+ push rbp
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop rbp
+ pop rbx
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10d
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 9*8 ; must be an odd multiple of 8
+ %define PS 8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ save_reg rbx, 10*16 + 6*8
+ save_reg rbp, 10*16 + 7*8
+ end_prolog
+ mov arg4, arg(4)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ mov rbx, [rsp + 10*16 + 6*8]
+ mov rbp, [rsp + 10*16 + 7*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg4
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables of murmur3
+%define mur_in_p tmp2
+%define mur_digest_p arg3
+%define mur_hash1 tmp3
+%define mur_hash2 tmp4
+%define mur_data1 tmp5
+%define mur_data2 return
+%define mur_c1_r tmp6
+%define mur_c2_r arg5
+; constants of murmur3_x64_128
+%define R1 31
+%define R2 33
+%define R3 27
+%define R4 31
+%define M 5
+%define N1 0x52dce729;DWORD
+%define N2 0x38495ab5;DWORD
+%define C1 QWORD(0x87c37b91114253d5)
+%define C2 QWORD(0x4cf5ad432745937f)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp7
+%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define MOVPS movups
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4
+%define F xmm5 ; tmp
+%define G xmm6 ; tmp
+
+%define TMP G
+%define FUN F
+%define K xmm7
+
+%define AA xmm8
+%define BB xmm9
+%define CC xmm10
+%define DD xmm11
+%define EE xmm12
+
+%define T0 xmm6
+%define T1 xmm7
+%define T2 xmm8
+%define T3 xmm9
+%define T4 xmm10
+%define T5 xmm11
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define W14 xmm13
+%define W15 xmm14
+%define W16 xmm15
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h4 | h4 | h4 | ...| h4 |    | Ea| Eb | Ec |...| Ep |
+
+align 32
+;void mh_sha1_murmur3_x64_128_block_sse (const uint8_t * input_data,
+; uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+; uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+; uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, include segments digests(uint32_t digests[16][5])
+; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data.
+; arg 3 pointer to murmur3 digest
+; arg 4 number of 1KB blocks
+;
+mk_global mh_sha1_murmur3_x64_128_block_sse, function, internal
+func(mh_sha1_murmur3_x64_128_block_sse)
+ endbranch
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 16 Bytes needed by sse
+ and rsp, ~0x0F
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 5
+ MOVPS A, [mh_digests_p + I*64 + 16*0]
+ MOVPS B, [mh_digests_p + I*64 + 16*1]
+ MOVPS C, [mh_digests_p + I*64 + 16*2]
+ MOVPS D, [mh_digests_p + I*64 + 16*3]
+
+ movdqa [rsp + I*64 + 16*0], A
+ movdqa [rsp + I*64 + 16*1], B
+ movdqa [rsp + I*64 + 16*2], C
+ movdqa [rsp + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+
+ ;init murmur variables
+	mov	mur_in_p, mh_in_p	;murmur and mh_sha1 walk the input at different strides, so keep a separate pointer
+ ;load murmur hash digests and multiplier
+ mov mur_hash1, [mur_digest_p]
+ mov mur_hash2, [mur_digest_p + 8]
+ mov mur_c1_r, C1
+ mov mur_c2_r, C2
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ movdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*5 to DWORD*4_SEGS*5*4
+ %assign I 0
+ %rep 16
+ MOVPS T0,[mh_in_p+I*64+0*16]
+ MOVPS T1,[mh_in_p+I*64+1*16]
+ MOVPS T2,[mh_in_p+I*64+2*16]
+ MOVPS T3,[mh_in_p+I*64+3*16]
+
+ pshufb T0, F
+ movdqa [mh_data_p+(I)*16 +0*256],T0
+ pshufb T1, F
+ movdqa [mh_data_p+(I)*16 +1*256],T1
+ pshufb T2, F
+ movdqa [mh_data_p+(I)*16 +2*256],T2
+ pshufb T3, F
+ movdqa [mh_data_p+(I)*16 +3*256],T3
+ %assign I (I+1)
+ %endrep
+
+ mov mh_segs, 0 ;start from the first 4 segments
+ .segs_loop:
+ ;; Initialize digests
+ movdqa A, [rsp + 0*64 + mh_segs]
+ movdqa B, [rsp + 1*64 + mh_segs]
+ movdqa C, [rsp + 2*64 + mh_segs]
+ movdqa D, [rsp + 3*64 + mh_segs]
+ movdqa E, [rsp + 4*64 + mh_segs]
+
+ movdqa AA, A
+ movdqa BB, B
+ movdqa CC, C
+ movdqa DD, D
+ movdqa EE, E
+;;
+;; perform 0-79 steps
+;;
+ movdqa K, [K00_19]
+;; do rounds 0...15
+ %assign I 0
+ %rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 16...19
+ movdqa W16, [mh_data_p + ((16 - 16) & 15) * 16]
+ movdqa W15, [mh_data_p + ((16 - 15) & 15) * 16]
+ %rep 4
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 20...39
+ movdqa K, [K20_39]
+ %rep 20
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 40...59
+ movdqa K, [K40_59]
+ %rep 20
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 60...79
+ movdqa K, [K60_79]
+ %rep 20
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+ paddd A, AA
+ paddd B, BB
+ paddd C, CC
+ paddd D, DD
+ paddd E, EE
+
+ ; write out digests
+ movdqa [rsp + 0*64 + mh_segs], A
+ movdqa [rsp + 1*64 + mh_segs], B
+ movdqa [rsp + 2*64 + mh_segs], C
+ movdqa [rsp + 3*64 + mh_segs], D
+ movdqa [rsp + 4*64 + mh_segs], E
+
+ add mh_data_p, 256
+ add mh_segs, 16
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+ ;store murmur-hash digest
+ mov [mur_digest_p], mur_hash1
+ mov [mur_digest_p + 8], mur_hash2
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 5
+ movdqa A, [rsp + I*64 + 16*0]
+ movdqa B, [rsp + I*64 + 16*1]
+ movdqa C, [rsp + I*64 + 16*2]
+ movdqa D, [rsp + I*64 + 16*3]
+
+ MOVPS [mh_digests_p + I*64 + 16*0], A
+ MOVPS [mh_digests_p + I*64 + 16*1], B
+ MOVPS [mh_digests_p + I*64 + 16*2], C
+ MOVPS [mh_digests_p + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=16
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c
new file mode 100644
index 000000000..4d09abf1d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c
@@ -0,0 +1,102 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef FINALIZE_FUNCTION
+#include <stdlib.h> // For NULL
+#include "mh_sha1_murmur3_x64_128_internal.h"
+
+#define FINALIZE_FUNCTION mh_sha1_murmur3_x64_128_finalize_base
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_base
+#define FINALIZE_FUNCTION_SLVER
+#endif
+
+#define MURMUR_BLOCK_FUNCTION murmur3_x64_128_block
+#define MURMUR_TAIL_FUNCTION murmur3_x64_128_tail
+
+int FINALIZE_FUNCTION(struct mh_sha1_murmur3_x64_128_ctx *ctx, void *mh_sha1_digest,
+ void *murmur3_x64_128_digest)
+{
+ uint8_t *partial_block_buffer, *murmur_tail_data;
+ uint64_t partial_block_len, total_len;
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS];
+ uint8_t *aligned_frame_buffer;
+
+ if (ctx == NULL)
+ return MH_SHA1_MURMUR3_CTX_ERROR_NULL;
+
+ total_len = ctx->total_length;
+ partial_block_len = total_len % MH_SHA1_BLOCK_SIZE;
+ partial_block_buffer = ctx->partial_block_buffer;
+
+	// Calculate murmur3 first, because mh_sha1 will modify partial_block_buffer.
+	// partial_block_buffer holds n whole murmur3 blocks followed by one murmur3 tail;
+	// e.g. with partial_block_len = 100, six 16-byte murmur3 blocks are hashed here
+	// and the remaining 4 bytes are left for the tail function.
+ murmur_tail_data =
+ partial_block_buffer + partial_block_len - partial_block_len % MUR_BLOCK_SIZE;
+ MURMUR_BLOCK_FUNCTION(partial_block_buffer, partial_block_len / MUR_BLOCK_SIZE,
+ ctx->murmur3_x64_128_digest);
+ MURMUR_TAIL_FUNCTION(murmur_tail_data, total_len, ctx->murmur3_x64_128_digest);
+
+ /* mh_sha1 final */
+ aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer);
+ mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests;
+
+ MH_SHA1_TAIL_FUNCTION(partial_block_buffer, total_len, mh_sha1_segs_digests,
+ aligned_frame_buffer, ctx->mh_sha1_digest);
+
+ /* Output the digests of murmur3 and mh_sha1 */
+ if (mh_sha1_digest != NULL) {
+ ((uint32_t *) mh_sha1_digest)[0] = ctx->mh_sha1_digest[0];
+ ((uint32_t *) mh_sha1_digest)[1] = ctx->mh_sha1_digest[1];
+ ((uint32_t *) mh_sha1_digest)[2] = ctx->mh_sha1_digest[2];
+ ((uint32_t *) mh_sha1_digest)[3] = ctx->mh_sha1_digest[3];
+ ((uint32_t *) mh_sha1_digest)[4] = ctx->mh_sha1_digest[4];
+ }
+
+ if (murmur3_x64_128_digest != NULL) {
+ ((uint32_t *) murmur3_x64_128_digest)[0] = ctx->murmur3_x64_128_digest[0];
+ ((uint32_t *) murmur3_x64_128_digest)[1] = ctx->murmur3_x64_128_digest[1];
+ ((uint32_t *) murmur3_x64_128_digest)[2] = ctx->murmur3_x64_128_digest[2];
+ ((uint32_t *) murmur3_x64_128_digest)[3] = ctx->murmur3_x64_128_digest[3];
+ }
+
+ return MH_SHA1_MURMUR3_CTX_ERROR_NONE;
+}
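+
+/*
+ * Illustrative usage sketch (not part of the upstream sources): callers
+ * normally reach this finalize through the public API declared in
+ * mh_sha1_murmur3_x64_128.h; the entry-point names below are assumed from
+ * that header.
+ *
+ *	struct mh_sha1_murmur3_x64_128_ctx ctx;
+ *	uint32_t sha1_digest[SHA1_DIGEST_WORDS];
+ *	uint32_t murmur_digest[MURMUR3_x64_128_DIGEST_WORDS];
+ *
+ *	mh_sha1_murmur3_x64_128_init(&ctx, murmur_seed);
+ *	mh_sha1_murmur3_x64_128_update(&ctx, buffer, buffer_len);
+ *	mh_sha1_murmur3_x64_128_finalize(&ctx, sha1_digest, murmur_digest);
+ */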
+
+#ifdef FINALIZE_FUNCTION_SLVER
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+ // Version info
+struct slver mh_sha1_murmur3_x64_128_finalize_base_slver_0000025b;
+struct slver mh_sha1_murmur3_x64_128_finalize_base_slver = { 0x025b, 0x00, 0x00 };
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_internal.h b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_internal.h
new file mode 100644
index 000000000..e77837347
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_internal.h
@@ -0,0 +1,202 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MH_SHA1_MURMUR3_X64_128_INTERNAL_H_
+#define _MH_SHA1_MURMUR3_X64_128_INTERNAL_H_
+
+/**
+ * @file mh_sha1_murmur3_x64_128_internal.h
+ * @brief mh_sha1_murmur3_x64_128 internal function prototypes and macros
+ *
+ * Interface for mh_sha1_murmur3_x64_128 internal functions
+ *
+ */
+#include <stdint.h>
+#include "mh_sha1_internal.h"
+#include "mh_sha1_murmur3_x64_128.h"
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#ifdef _MSC_VER
+# define inline __inline
+#endif
+
+ /*******************************************************************
+ * mh_sha1_murmur3_x64_128 API internal function prototypes
+ * Multiple versions of Update and Finalize functions are supplied which use
+ * multiple versions of block and tail process subfunctions.
+ ******************************************************************/
+
+ /**
+ * @brief Calculate blocks whose size is MH_SHA1_BLOCK_SIZE*N
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param mh_sha1_digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+	// Each version needs its own C or ASM file because these block functions are
+	// performance-critical. They are called by mh_sha1_murmur3_x64_128_update_XXX.
+ void mh_sha1_murmur3_x64_128_block (const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks);
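+
+ /* Illustrative only (not part of the upstream header): invoking the
+ * runtime-dispatched block function above on two 1KB blocks, with buffers
+ * shaped as in the prototype; internal callers pass a 64-byte-aligned
+ * frame buffer.
+ *
+ *	uint32_t sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS];
+ *	uint32_t murmur_digests[MURMUR3_x64_128_DIGEST_WORDS];
+ *	uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE];
+ *
+ *	mh_sha1_murmur3_x64_128_block(input_data, sha1_digests,
+ *				      frame_buffer, murmur_digests, 2);
+ */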
+
+ /**
+ * @brief Calculate blocks whose size is MH_SHA1_BLOCK_SIZE*N
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param mh_sha1_digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_murmur3_x64_128_block_base (const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks);
+
+ /**
+  * @brief Calculate blocks whose size is MH_SHA1_BLOCK_SIZE*N
+ *
+ * @requires SSE
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param mh_sha1_digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_murmur3_x64_128_block_sse (const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks);
+
+ /**
+  * @brief Calculate blocks whose size is MH_SHA1_BLOCK_SIZE*N
+ *
+ * @requires AVX
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param mh_sha1_digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_murmur3_x64_128_block_avx (const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks);
+
+ /**
+  * @brief Calculate blocks whose size is MH_SHA1_BLOCK_SIZE*N
+ *
+ * @requires AVX2
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param mh_sha1_digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_murmur3_x64_128_block_avx2 (const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks);
+
+ /**
+  * @brief Calculate blocks whose size is MH_SHA1_BLOCK_SIZE*N
+ *
+ * @requires AVX512
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param mh_sha1_digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_murmur3_x64_128_block_avx512 (const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks);
+ /*******************************************************************
+ * murmur hash API
+ ******************************************************************/
+
+ /**
+  * @brief Calculate the murmur3 digest of blocks whose total size is 16*N bytes.
+ * @param input_data Pointer to input data to be processed
+  * @param num_blocks The number of 16-byte blocks.
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @returns none
+ *
+ */
+ void murmur3_x64_128_block(const uint8_t * input_data, uint32_t num_blocks,
+ uint32_t digests[MURMUR3_x64_128_DIGEST_WORDS]);
+
+ /**
+  * @brief Process the tail, which is shorter than 16 bytes.
+ * @param tail_buffer Pointer to input data to be processed
+ * @param total_len The total length of the input_data
+ * @param digests Murmur3 digest
+ * @returns none
+ *
+ */
+ void murmur3_x64_128_tail(const uint8_t * tail_buffer, uint32_t total_len,
+ uint32_t digests[MURMUR3_x64_128_DIGEST_WORDS]);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
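
The header above notes that each block function sits in its own C or ASM file and is called by an mh_sha1_murmur3_x64_128_update_XXX variant. Those arch-specific wrapper files are not part of this hunk; the sketch below only illustrates how such a variant could be produced by pre-defining the UPDATE_FUNCTION/BLOCK_FUNCTION macros and reusing the generic logic in mh_sha1_murmur3_x64_128_update_base.c (added later in this patch, and guarded with #ifndef UPDATE_FUNCTION). The file name mh_sha1_murmur3_x64_128_update_sse.c is an assumption.

    /* Hypothetical mh_sha1_murmur3_x64_128_update_sse.c: pre-define the
     * function-name macros, then include the generic update implementation,
     * whose own defaults are skipped because UPDATE_FUNCTION is defined. */
    #include "mh_sha1_murmur3_x64_128_internal.h"
    #include <string.h>

    #define UPDATE_FUNCTION mh_sha1_murmur3_x64_128_update_sse
    #define BLOCK_FUNCTION  mh_sha1_murmur3_x64_128_block_sse

    #include "mh_sha1_murmur3_x64_128_update_base.c"
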
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_multibinary.asm
new file mode 100644
index 000000000..6f9e54cdd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_multibinary.asm
@@ -0,0 +1,76 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf32
+ [bits 32]
+%else
+ default rel
+ [bits 64]
+
+ extern mh_sha1_murmur3_x64_128_update_sse
+ extern mh_sha1_murmur3_x64_128_update_avx
+ extern mh_sha1_murmur3_x64_128_update_avx2
+ extern mh_sha1_murmur3_x64_128_finalize_sse
+ extern mh_sha1_murmur3_x64_128_finalize_avx
+ extern mh_sha1_murmur3_x64_128_finalize_avx2
+
+ %ifdef HAVE_AS_KNOWS_AVX512
+ extern mh_sha1_murmur3_x64_128_update_avx512
+ extern mh_sha1_murmur3_x64_128_finalize_avx512
+ %endif
+
+%endif
+
+extern mh_sha1_murmur3_x64_128_update_base
+extern mh_sha1_murmur3_x64_128_finalize_base
+
+mbin_interface mh_sha1_murmur3_x64_128_update
+mbin_interface mh_sha1_murmur3_x64_128_finalize
+
+%ifidn __OUTPUT_FORMAT__, elf64
+
+ %ifdef HAVE_AS_KNOWS_AVX512
+ mbin_dispatch_init6 mh_sha1_murmur3_x64_128_update, mh_sha1_murmur3_x64_128_update_base, mh_sha1_murmur3_x64_128_update_sse, mh_sha1_murmur3_x64_128_update_avx, mh_sha1_murmur3_x64_128_update_avx2, mh_sha1_murmur3_x64_128_update_avx512
+ mbin_dispatch_init6 mh_sha1_murmur3_x64_128_finalize, mh_sha1_murmur3_x64_128_finalize_base, mh_sha1_murmur3_x64_128_finalize_sse, mh_sha1_murmur3_x64_128_finalize_avx, mh_sha1_murmur3_x64_128_finalize_avx2, mh_sha1_murmur3_x64_128_finalize_avx512
+ %else
+ mbin_dispatch_init5 mh_sha1_murmur3_x64_128_update, mh_sha1_murmur3_x64_128_update_base, mh_sha1_murmur3_x64_128_update_sse, mh_sha1_murmur3_x64_128_update_avx, mh_sha1_murmur3_x64_128_update_avx2
+ mbin_dispatch_init5 mh_sha1_murmur3_x64_128_finalize, mh_sha1_murmur3_x64_128_finalize_base, mh_sha1_murmur3_x64_128_finalize_sse, mh_sha1_murmur3_x64_128_finalize_avx, mh_sha1_murmur3_x64_128_finalize_avx2
+ %endif
+
+%else
+ mbin_dispatch_init2 mh_sha1_murmur3_x64_128_update, mh_sha1_murmur3_x64_128_update_base
+ mbin_dispatch_init2 mh_sha1_murmur3_x64_128_finalize, mh_sha1_murmur3_x64_128_finalize_base
+%endif
+
+;;; func core, ver, snum
+slversion mh_sha1_murmur3_x64_128_update, 00, 02, 0252
+slversion mh_sha1_murmur3_x64_128_finalize, 00, 02, 0253
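
mbin_interface and mbin_dispatch_init5/6 (from multibinary.asm) set up runtime dispatch: the exported update/finalize symbols initially resolve to the best implementation the CPU supports (base, SSE, AVX, AVX2, or AVX512 when the assembler knows it) and then stay bound to it. The C model below is only a sketch of that idea, not the actual macro expansion; the two implementations and cpu_has_avx2() are stand-ins.

    /* Conceptual C model of multibinary dispatch: a function pointer starts
     * at a resolver, the resolver picks an implementation once, and later
     * calls jump straight to the chosen version. */
    #include <stdint.h>
    #include <stdio.h>

    typedef int (*update_fn)(const void *buf, uint32_t len);

    static int update_base(const void *buf, uint32_t len)
    {
    	(void)buf;
    	printf("base path, %u bytes\n", len);
    	return 0;
    }

    static int update_avx2(const void *buf, uint32_t len)
    {
    	(void)buf;
    	printf("avx2 path, %u bytes\n", len);
    	return 0;
    }

    static int cpu_has_avx2(void)
    {
    	return 0;	/* stand-in; a real probe would query CPUID for AVX2 support */
    }

    static int update_resolver(const void *buf, uint32_t len);
    static update_fn update_ptr = update_resolver;

    static int update_resolver(const void *buf, uint32_t len)
    {
    	update_ptr = cpu_has_avx2() ? update_avx2 : update_base;
    	return update_ptr(buf, len);	/* the resolving call is serviced too */
    }

    int main(void)
    {
    	uint8_t data[64] = { 0 };

    	update_ptr(data, sizeof(data));	/* first call resolves */
    	update_ptr(data, sizeof(data));	/* later calls go direct */
    	return 0;
    }
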
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_perf.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_perf.c
new file mode 100644
index 000000000..77ebb964e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_perf.c
@@ -0,0 +1,206 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha1_murmur3_x64_128.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Loop many times over the same (cached) buffer
+# define TEST_LEN 16*1024
+# define TEST_LOOPS 20000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define TEST_LEN 32*1024*1024
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#define TEST_MEM TEST_LEN
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA1_FUNC_TYPE
+#define MH_SHA1_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_update, MH_SHA1_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_finalize, MH_SHA1_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA1_MURMUR3_CTX_ERROR_NONE){ \
+		printf("The stitch function failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+extern void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest);
+
+extern void murmur3_x64_128(const void *buffer, uint32_t len, uint64_t murmur_seed,
+ uint32_t * murmur3_x64_128_digest);
+
+void mh_sha1_murmur3_x64_128_base(const void *buffer, uint32_t len, uint64_t murmur_seed,
+ uint32_t * mh_sha1_digest, uint32_t * murmur3_x64_128_digest)
+{
+ mh_sha1_ref(buffer, len, mh_sha1_digest);
+ murmur3_x64_128(buffer, len, murmur_seed, murmur3_x64_128_digest);
+
+ return;
+}
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 20 == 0)
+ printf("\n");
+ }
+ if (i % 20 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_base[SHA1_DIGEST_WORDS],
+ uint32_t hash_test[SHA1_DIGEST_WORDS],
+ uint32_t murmur3_base[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha1_fail = 0;
+ int murmur3_fail = 0;
+
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_base[i])
+ mh_sha1_fail++;
+ }
+
+ for (i = 0; i < MURMUR3_x64_128_DIGEST_WORDS; i++) {
+ if (murmur3_test[i] != murmur3_base[i])
+ murmur3_fail++;
+ }
+
+ if (mh_sha1_fail) {
+ printf("mh_sha1 fail test\n");
+ printf("base: ");
+ dump((char *)hash_base, 20);
+ printf("ref: ");
+ dump((char *)hash_test, 20);
+ }
+ if (murmur3_fail) {
+ printf("murmur3 fail test\n");
+ printf("base: ");
+ dump((char *)murmur3_base, 16);
+ printf("ref: ");
+ dump((char *)murmur3_test, 16);
+ }
+
+ return mh_sha1_fail + murmur3_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int i, fail = 0;
+ uint32_t hash_test[SHA1_DIGEST_WORDS], hash_base[SHA1_DIGEST_WORDS];
+ uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS],
+ murmur3_base[MURMUR3_x64_128_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ struct mh_sha1_murmur3_x64_128_ctx *update_ctx = NULL;
+ struct perf start, stop;
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_perf:\n");
+
+ buff = malloc(TEST_LEN);
+ update_ctx = malloc(sizeof(*update_ctx));
+
+ if (buff == NULL || update_ctx == NULL) {
+		printf("malloc failed, test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ // mh_sha1_murmur3 base version
+ mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base, murmur3_base);
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS / 10; i++) {
+ mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base,
+ murmur3_base);
+ }
+ perf_stop(&stop);
+ printf("mh_sha1_murmur3_x64_128_base" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_MEM * i);
+
+ //Update feature test
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+ }
+ perf_stop(&stop);
+ printf(xstr(TEST_UPDATE_FUNCTION) TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_MEM * i);
+
+ // Check results
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", TEST_LEN);
+ return -1;
+ }
+
+ if (fail)
+ printf("Test failed function test%d\n", fail);
+ else
+ printf("Pass func check\n");
+
+ return fail;
+}
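
The perf harness above drives the stitched hash through the init/update/finalize triple. For reference, a minimal standalone use of the same public API is sketched below; the digests are printed rather than compared against fixed vectors, since no test vectors appear in this hunk.

    /* Minimal use of the stitched mh_sha1 + murmur3 API. */
    #include <stdio.h>
    #include "mh_sha1_murmur3_x64_128.h"

    int main(void)
    {
    	static const char msg[] = "hello, mh_sha1 + murmur3";
    	struct mh_sha1_murmur3_x64_128_ctx ctx;
    	uint32_t sha1_digest[SHA1_DIGEST_WORDS];
    	uint32_t murmur_digest[MURMUR3_x64_128_DIGEST_WORDS];
    	int i;

    	if (mh_sha1_murmur3_x64_128_init(&ctx, 0x1234) != MH_SHA1_MURMUR3_CTX_ERROR_NONE ||
    	    mh_sha1_murmur3_x64_128_update(&ctx, msg, sizeof(msg) - 1) != MH_SHA1_MURMUR3_CTX_ERROR_NONE ||
    	    mh_sha1_murmur3_x64_128_finalize(&ctx, sha1_digest, murmur_digest) != MH_SHA1_MURMUR3_CTX_ERROR_NONE) {
    		printf("stitch call failed\n");
    		return 1;
    	}

    	printf("mh_sha1 : ");
    	for (i = 0; i < SHA1_DIGEST_WORDS; i++)
    		printf("%08x", sha1_digest[i]);
    	printf("\nmurmur3 : ");
    	for (i = 0; i < MURMUR3_x64_128_DIGEST_WORDS; i++)
    		printf("%08x", murmur_digest[i]);
    	printf("\n");
    	return 0;
    }
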
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_test.c
new file mode 100644
index 000000000..22ab6d1f9
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_test.c
@@ -0,0 +1,248 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha1_murmur3_x64_128.h"
+
+#define TEST_LEN 16*1024
+#define TEST_SIZE 8*1024
+#define TEST_MEM TEST_LEN
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA1_FUNC_TYPE
+#define MH_SHA1_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_update, MH_SHA1_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_finalize, MH_SHA1_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA1_MURMUR3_CTX_ERROR_NONE){ \
+		printf("The stitch function failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+extern void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest);
+
+extern void murmur3_x64_128(const void *buffer, uint32_t len, uint64_t murmur_seed,
+ uint32_t * murmur3_x64_128_digest);
+
+void mh_sha1_murmur3_x64_128_base(const void *buffer, uint32_t len, uint64_t murmur_seed,
+ uint32_t * mh_sha1_digest, uint32_t * murmur3_x64_128_digest)
+{
+ mh_sha1_ref(buffer, len, mh_sha1_digest);
+ murmur3_x64_128(buffer, len, murmur_seed, murmur3_x64_128_digest);
+
+ return;
+}
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 20 == 0)
+ printf("\n");
+ }
+ if (i % 20 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_base[SHA1_DIGEST_WORDS],
+ uint32_t hash_test[SHA1_DIGEST_WORDS],
+ uint32_t murmur3_base[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha1_fail = 0;
+ int murmur3_fail = 0;
+
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_base[i])
+ mh_sha1_fail++;
+ }
+
+ for (i = 0; i < MURMUR3_x64_128_DIGEST_WORDS; i++) {
+ if (murmur3_test[i] != murmur3_base[i])
+ murmur3_fail++;
+ }
+
+ if (mh_sha1_fail) {
+ printf("mh_sha1 fail test\n");
+ printf("base: ");
+ dump((char *)hash_base, 20);
+ printf("ref: ");
+ dump((char *)hash_test, 20);
+ }
+ if (murmur3_fail) {
+ printf("murmur3 fail test\n");
+ printf("base: ");
+ dump((char *)murmur3_base, 16);
+ printf("ref: ");
+ dump((char *)murmur3_test, 16);
+ }
+
+ return mh_sha1_fail + murmur3_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int fail = 0;
+ uint32_t hash_test[SHA1_DIGEST_WORDS], hash_base[SHA1_DIGEST_WORDS];
+ uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS],
+ murmur3_base[MURMUR3_x64_128_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ int size, offset;
+ struct mh_sha1_murmur3_x64_128_ctx *update_ctx = NULL;
+
+ printf(" " xstr(TEST_UPDATE_FUNCTION) "_test:");
+
+ srand(TEST_SEED);
+
+ buff = malloc(TEST_LEN);
+ update_ctx = malloc(sizeof(*update_ctx));
+
+ if (buff == NULL || update_ctx == NULL) {
+		printf("malloc failed, test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base, murmur3_base);
+
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("fail rand1 test\n");
+ return -1;
+ } else
+ putchar('.');
+
+ // Test various size messages
+ for (size = TEST_LEN; size >= 0; size--) {
+
+ // Fill with rand data
+ rand_buffer(buff, size);
+
+ mh_sha1_murmur3_x64_128_base(buff, size, TEST_SEED, hash_base, murmur3_base);
+
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", size);
+ return -1;
+ }
+
+ if ((size & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Test various buffer offsets and sizes
+ printf("offset tests");
+ for (size = TEST_LEN - 256; size > 256; size -= 11) {
+ for (offset = 0; offset < 256; offset++) {
+ mh_sha1_murmur3_x64_128_base(buff + offset, size, TEST_SEED,
+ hash_base, murmur3_base);
+
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail =
+ compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("Fail size=%d offset=%d\n", size, offset);
+ return -1;
+ }
+
+ }
+ if ((size & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Run efence tests
+ printf("efence tests");
+ for (size = TEST_SIZE; size > 0; size--) {
+ offset = TEST_LEN - size;
+ mh_sha1_murmur3_x64_128_base(buff + offset, size, TEST_SEED,
+ hash_base, murmur3_base);
+
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("Fail size=%d offset=%d\n", size, offset);
+ return -1;
+ }
+
+ if ((size & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ printf("\n" xstr(TEST_UPDATE_FUNCTION) "_test: %s\n", fail == 0 ? "Pass" : "Fail");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c
new file mode 100644
index 000000000..0e7a3970d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c
@@ -0,0 +1,107 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef UPDATE_FUNCTION
+#include "mh_sha1_murmur3_x64_128_internal.h"
+#include <string.h>
+
+#define UPDATE_FUNCTION mh_sha1_murmur3_x64_128_update_base
+#define BLOCK_FUNCTION mh_sha1_murmur3_x64_128_block_base
+#define UPDATE_FUNCTION_SLVER
+#endif
+
+int UPDATE_FUNCTION(struct mh_sha1_murmur3_x64_128_ctx *ctx, const void *buffer, uint32_t len)
+{
+
+ uint8_t *partial_block_buffer;
+ uint64_t partial_block_len;
+ uint64_t num_blocks;
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS];
+ uint8_t *aligned_frame_buffer;
+ uint32_t *murmur3_x64_128_digest;
+ const uint8_t *input_data = (const uint8_t *)buffer;
+
+ if (ctx == NULL)
+ return MH_SHA1_MURMUR3_CTX_ERROR_NULL;
+
+ if (len == 0)
+ return MH_SHA1_MURMUR3_CTX_ERROR_NONE;
+
+ partial_block_len = ctx->total_length % MH_SHA1_BLOCK_SIZE;
+ partial_block_buffer = ctx->partial_block_buffer;
+ aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer);
+ mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests;
+ murmur3_x64_128_digest = ctx->murmur3_x64_128_digest;
+
+ ctx->total_length += len;
+	// Not enough input data for an mh_sha1 block calculation
+ if (len + partial_block_len < MH_SHA1_BLOCK_SIZE) {
+ memcpy(partial_block_buffer + partial_block_len, input_data, len);
+ return MH_SHA1_MURMUR3_CTX_ERROR_NONE;
+ }
+ // mh_sha1 calculation for the previous partial block
+ if (partial_block_len != 0) {
+ memcpy(partial_block_buffer + partial_block_len, input_data,
+ MH_SHA1_BLOCK_SIZE - partial_block_len);
+		// Process this one completed block
+ BLOCK_FUNCTION(partial_block_buffer, mh_sha1_segs_digests,
+ aligned_frame_buffer, murmur3_x64_128_digest, 1);
+ input_data += MH_SHA1_BLOCK_SIZE - partial_block_len;
+ len -= MH_SHA1_BLOCK_SIZE - partial_block_len;
+ memset(partial_block_buffer, 0, MH_SHA1_BLOCK_SIZE);
+ }
+ // Calculate mh_sha1 for the current blocks
+ num_blocks = len / MH_SHA1_BLOCK_SIZE;
+ if (num_blocks > 0) {
+		// Process num_blocks whole blocks
+ BLOCK_FUNCTION(input_data, mh_sha1_segs_digests, aligned_frame_buffer,
+ murmur3_x64_128_digest, num_blocks);
+ len -= num_blocks * MH_SHA1_BLOCK_SIZE;
+ input_data += num_blocks * MH_SHA1_BLOCK_SIZE;
+ }
+ // Store the partial block
+ if (len != 0) {
+ memcpy(partial_block_buffer, input_data, len);
+ }
+
+ return MH_SHA1_MURMUR3_CTX_ERROR_NONE;
+
+}
+
+#ifdef UPDATE_FUNCTION_SLVER
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+ // Version info
+struct slver mh_sha1_murmur3_x64_128_update_base_slver_0000025a;
+struct slver mh_sha1_murmur3_x64_128_update_base_slver = { 0x025a, 0x00, 0x00 };
+#endif
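
mh_sha1_murmur3_x64_128_update_base() buffers any trailing partial block in ctx->partial_block_buffer and only passes whole MH_SHA1_BLOCK_SIZE chunks to the block function, so splitting an input across several update calls must yield the same digests as a single call. A short sketch of that property using only the public API (return-value checks trimmed for brevity):

    /* Streaming vs. one-shot equivalence, relying on the partial-block
     * buffering in update_base; return values are ignored for brevity. */
    #include <assert.h>
    #include <string.h>
    #include "mh_sha1_murmur3_x64_128.h"

    static void digest_split(const uint8_t *buf, uint32_t len, uint32_t split,
    			 uint32_t sha1[SHA1_DIGEST_WORDS],
    			 uint32_t mur[MURMUR3_x64_128_DIGEST_WORDS])
    {
    	struct mh_sha1_murmur3_x64_128_ctx ctx;

    	mh_sha1_murmur3_x64_128_init(&ctx, 0x1234);
    	mh_sha1_murmur3_x64_128_update(&ctx, buf, split);
    	mh_sha1_murmur3_x64_128_update(&ctx, buf + split, len - split);
    	mh_sha1_murmur3_x64_128_finalize(&ctx, sha1, mur);
    }

    int main(void)
    {
    	uint8_t buf[3000];
    	uint32_t s1[SHA1_DIGEST_WORDS], m1[MURMUR3_x64_128_DIGEST_WORDS];
    	uint32_t s2[SHA1_DIGEST_WORDS], m2[MURMUR3_x64_128_DIGEST_WORDS];
    	uint32_t i;

    	for (i = 0; i < sizeof(buf); i++)
    		buf[i] = (uint8_t)(i * 7 + 1);

    	digest_split(buf, sizeof(buf), sizeof(buf), s1, m1);	/* whole buffer, then 0 bytes */
    	digest_split(buf, sizeof(buf), 123, s2, m2);		/* uneven split */

    	assert(!memcmp(s1, s2, sizeof(s1)));
    	assert(!memcmp(m1, m2, sizeof(m1)));
    	return 0;
    }
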
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_test.c
new file mode 100644
index 000000000..6ae888e21
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_test.c
@@ -0,0 +1,272 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha1_murmur3_x64_128.h"
+
+#define TEST_LEN 16*1024
+#define TEST_SIZE 8*1024
+#define TEST_MEM TEST_LEN
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA1_FUNC_TYPE
+#define MH_SHA1_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_update, MH_SHA1_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_finalize, MH_SHA1_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA1_MURMUR3_CTX_ERROR_NONE){ \
+		printf("The stitch function failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+extern void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest);
+
+extern void murmur3_x64_128(const void *buffer, uint32_t len, uint64_t murmur_seed,
+ uint32_t * murmur3_x64_128_digest);
+
+void mh_sha1_murmur3_x64_128_base(const void *buffer, uint32_t len, uint64_t murmur_seed,
+ uint32_t * mh_sha1_digest, uint32_t * murmur3_x64_128_digest)
+{
+ mh_sha1_ref(buffer, len, mh_sha1_digest);
+ murmur3_x64_128(buffer, len, murmur_seed, murmur3_x64_128_digest);
+
+ return;
+}
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 20 == 0)
+ printf("\n");
+ }
+ if (i % 20 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_base[SHA1_DIGEST_WORDS],
+ uint32_t hash_test[SHA1_DIGEST_WORDS],
+ uint32_t murmur3_base[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha1_fail = 0;
+ int murmur3_fail = 0;
+
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_base[i])
+ mh_sha1_fail++;
+ }
+
+ for (i = 0; i < MURMUR3_x64_128_DIGEST_WORDS; i++) {
+ if (murmur3_test[i] != murmur3_base[i])
+ murmur3_fail++;
+ }
+
+ if (mh_sha1_fail) {
+ printf("mh_sha1 fail test\n");
+ printf("base: ");
+ dump((char *)hash_base, 20);
+ printf("ref: ");
+ dump((char *)hash_test, 20);
+ }
+ if (murmur3_fail) {
+ printf("murmur3 fail test\n");
+ printf("base: ");
+ dump((char *)murmur3_base, 16);
+ printf("ref: ");
+ dump((char *)murmur3_test, 16);
+ }
+
+ return mh_sha1_fail + murmur3_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int fail = 0, i;
+ uint32_t hash_test[SHA1_DIGEST_WORDS], hash_base[SHA1_DIGEST_WORDS];
+ uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS],
+ murmur3_base[MURMUR3_x64_128_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ int update_count;
+ int size1, size2, offset, addr_offset;
+ struct mh_sha1_murmur3_x64_128_ctx *update_ctx = NULL;
+ uint8_t *mem_addr = NULL;
+
+ printf(" " xstr(TEST_UPDATE_FUNCTION) "_test:");
+
+ srand(TEST_SEED);
+
+ buff = malloc(TEST_LEN);
+ update_ctx = malloc(sizeof(*update_ctx));
+
+ if (buff == NULL || update_ctx == NULL) {
+		printf("malloc failed, test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base, murmur3_base);
+
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("fail rand1 test\n");
+ return -1;
+ } else
+ putchar('.');
+
+	// Test various message sizes split across two updates.
+ printf("\n various size messages by update twice tests");
+ for (size1 = TEST_LEN; size1 >= 0; size1--) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base,
+ murmur3_base);
+
+ // subsequent update
+		size2 = TEST_LEN - size1;	// size2 differs from size1
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size1));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + size1, size2));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("Fail size1=%d\n", size1);
+ return -1;
+ }
+
+ if ((size2 & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Test various update count
+ printf("\n various update count tests");
+ for (update_count = 1; update_count <= TEST_LEN; update_count++) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base,
+ murmur3_base);
+
+ // subsequent update
+ size1 = TEST_LEN / update_count;
+		size2 = TEST_LEN - size1 * (update_count - 1);	// size2 differs from size1
+
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ for (i = 1, offset = 0; i < update_count; i++) {
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size1));
+ offset += size1;
+ }
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size2));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("Fail size1=%d\n", size1);
+ return -1;
+ }
+
+ if ((size2 & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+	// Test various start addresses of the ctx.
+ printf("\n various start address of ctx test");
+ free(update_ctx);
+ mem_addr = (uint8_t *) malloc(sizeof(*update_ctx) + AVX512_ALIGNED * 10);
+ for (addr_offset = AVX512_ALIGNED * 10; addr_offset >= 0; addr_offset--) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base,
+ murmur3_base);
+
+		// an unaligned offset
+ update_ctx = (struct mh_sha1_murmur3_x64_128_ctx *)(mem_addr + addr_offset);
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("Fail addr_offset=%d\n", addr_offset);
+ return -1;
+ }
+
+ if ((addr_offset & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ printf("\n" xstr(TEST_UPDATE_FUNCTION) "_test: %s\n", fail == 0 ? "Pass" : "Fail");
+
+ return fail;
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128.c
new file mode 100644
index 000000000..f5fe30a83
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128.c
@@ -0,0 +1,85 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h> // for NULL
+#include "murmur3_x64_128_internal.c"
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX2 __attribute__ ((optimize(1)))
+#else
+# define OPT_FIX2
+#endif
+
+/*******************************************************************
+ * Single API which can calculate murmur3
+ ******************************************************************/
+/**
+ * @brief Get the digest of murmur3_x64_128 through a single API.
+ *
+ * Uses murmur3_x64_128_block and murmur3_x64_128_tail.
+ * Serves as the reference for testing the murmur3_x64_128 digest.
+ *
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param murmur_seed Seed as an initial digest of murmur3
+ * @param murmur3_x64_128_digest The digest of murmur3_x64_128
+ * @returns none
+ *
+ */
+void OPT_FIX2 murmur3_x64_128(const void *buffer, uint32_t len, uint64_t murmur_seed,
+ uint32_t * murmur3_x64_128_digest)
+{
+ uint64_t *murmur3_x64_128_hash;
+ uint32_t murmur3_x64_128_hash_dword[4];
+ uint8_t *tail_buffer;
+ const uint8_t *input_data = (const uint8_t *)buffer;
+
+	// Initialize the murmur3 state from the seed
+ murmur3_x64_128_hash = (uint64_t *) murmur3_x64_128_hash_dword;
+ murmur3_x64_128_hash[0] = murmur_seed;
+ murmur3_x64_128_hash[1] = murmur_seed;
+
+ // process bodies
+ murmur3_x64_128_block((uint8_t *) input_data, len / MUR_BLOCK_SIZE,
+ murmur3_x64_128_hash_dword);
+
+ // process finalize
+ tail_buffer = (uint8_t *) input_data + len - len % MUR_BLOCK_SIZE;
+ murmur3_x64_128_tail(tail_buffer, len, murmur3_x64_128_hash_dword);
+
+ // output the digests
+ if (murmur3_x64_128_digest != NULL) {
+ murmur3_x64_128_digest[0] = murmur3_x64_128_hash_dword[0];
+ murmur3_x64_128_digest[1] = murmur3_x64_128_hash_dword[1];
+ murmur3_x64_128_digest[2] = murmur3_x64_128_hash_dword[2];
+ murmur3_x64_128_digest[3] = murmur3_x64_128_hash_dword[3];
+ }
+
+ return;
+}
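
murmur3_x64_128() above is the reference single-call wrapper: it seeds both 64-bit halves of the state, runs murmur3_x64_128_block() over the full 16-byte blocks, then murmur3_x64_128_tail() over the remainder. A minimal standalone call is sketched below; the digest is simply printed, since no fixed test vector is asserted in this hunk.

    /* Standalone call to the reference murmur3_x64_128() defined above.
     * The function is declared here the same way the tests declare it. */
    #include <stdio.h>
    #include <stdint.h>

    extern void murmur3_x64_128(const void *buffer, uint32_t len, uint64_t murmur_seed,
    			    uint32_t *murmur3_x64_128_digest);

    int main(void)
    {
    	const char msg[] = "The quick brown fox jumps over the lazy dog";
    	uint32_t digest[4];	/* 4 words, i.e. MURMUR3_x64_128_DIGEST_WORDS */
    	int i;

    	murmur3_x64_128(msg, sizeof(msg) - 1, 0x1234, digest);
    	for (i = 0; i < 4; i++)
    		printf("%08x", digest[i]);
    	printf("\n");
    	return 0;
    }
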
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128_internal.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128_internal.c
new file mode 100644
index 000000000..67eabd0c4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128_internal.c
@@ -0,0 +1,138 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "mh_sha1_murmur3_x64_128_internal.h"
+#include <stdlib.h> // for NULL
+
+/* murmur3_x64_128 constants */
+// Rotation (circular shift) amounts in bits
+#define MUR_SH1 31
+#define MUR_SH2 33
+#define MUR_SH3 27
+#define MUR_SH4 31
+#define MUR_SH5 33
+
+#define MUR_MUL 5
+#define MUR_ADD1 0x52dce729
+#define MUR_ADD2 0x38495ab5
+
+#define MUR_CON1 0x87c37b91114253d5LLU
+#define MUR_CON2 0x4cf5ad432745937fLLU
+
+#define MUR_FMUL1 0xff51afd7ed558ccdLLU
+#define MUR_FMUL2 0xc4ceb9fe1a85ec53LLU
+
+/* murmur3_x64_128 inline functions */
+static inline uint64_t blockmix64(uint64_t data, uint64_t conA, uint64_t conB, uint64_t shift)
+{
+ data *= conA;
+ data = (data << shift) | (data >> (64 - shift));
+ data *= conB;
+ return data;
+}
+
+static inline uint64_t hashmix64(uint64_t hashA, uint64_t hashB, uint64_t data, uint64_t add,
+ uint64_t shift)
+{
+ hashA ^= data;
+ hashA = (hashA << shift) | (hashA >> (64 - shift));
+ hashA += hashB;
+ hashA = hashA * MUR_MUL + add;
+ return hashA;
+}
+
+void murmur3_x64_128_block(const uint8_t * input_data, uint32_t num_blocks,
+ uint32_t digests[MURMUR3_x64_128_DIGEST_WORDS])
+{
+ uint64_t data1, data2;
+ uint64_t *input_qword = (uint64_t *) input_data;
+ uint64_t *hash = (uint64_t *) digests;
+ uint32_t i = 0;
+
+ while (i < num_blocks) {
+ data1 = input_qword[i * 2];
+ data2 = input_qword[i * 2 + 1];
+ data1 = blockmix64(data1, MUR_CON1, MUR_CON2, MUR_SH1);
+ data2 = blockmix64(data2, MUR_CON2, MUR_CON1, MUR_SH2);
+ hash[0] = hashmix64(hash[0], hash[1], data1, MUR_ADD1, MUR_SH3);
+ hash[1] = hashmix64(hash[1], hash[0], data2, MUR_ADD2, MUR_SH4);
+ i++;
+ }
+
+ return;
+}
+
+void murmur3_x64_128_tail(const uint8_t * tail_buffer, uint32_t total_len,
+ uint32_t digests[MURMUR3_x64_128_DIGEST_WORDS])
+{
+ uint64_t data1, data2;
+ uint64_t *hash = (uint64_t *) digests;
+ uint64_t tail_len = total_len % 16;
+ uint8_t *tail = (uint8_t *) tail_buffer;
+
+ union {
+ uint64_t hash[2];
+ uint8_t hashB[16];
+ } hashU;
+
+ // tail
+ hashU.hash[0] = hashU.hash[1] = 0;
+
+ while (tail_len-- > 0)
+ hashU.hashB[tail_len] = tail[tail_len];
+
+ data1 = hashU.hash[0];
+ data2 = hashU.hash[1];
+
+ data1 = blockmix64(data1, MUR_CON1, MUR_CON2, MUR_SH1);
+ data2 = blockmix64(data2, MUR_CON2, MUR_CON1, MUR_SH2);
+
+ hash[0] ^= total_len ^ data1;
+ hash[1] ^= total_len ^ data2;
+
+ hash[0] += hash[1];
+ hash[1] += hash[0];
+
+ hash[0] ^= hash[0] >> MUR_SH5;
+ hash[0] *= MUR_FMUL1;
+ hash[0] ^= hash[0] >> MUR_SH5;
+ hash[0] *= MUR_FMUL2;
+ hash[0] ^= hash[0] >> MUR_SH5;
+
+ hash[1] ^= hash[1] >> MUR_SH5;
+ hash[1] *= MUR_FMUL1;
+ hash[1] ^= hash[1] >> MUR_SH5;
+ hash[1] *= MUR_FMUL2;
+ hash[1] ^= hash[1] >> MUR_SH5;
+
+ hash[0] += hash[1];
+ hash[1] += hash[0];
+
+ return;
+}
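
blockmix64() and hashmix64() implement their rotations with the shift pair (x << s) | (x >> (64 - s)). A tiny self-check of that rotate-left identity, illustrative only and not part of the library:

    /* (x << s) | (x >> (64 - s)) is a 64-bit circular left shift for 0 < s < 64. */
    #include <assert.h>
    #include <stdint.h>

    static uint64_t rotl64(uint64_t x, unsigned s)
    {
    	return (x << s) | (x >> (64 - s));
    }

    int main(void)
    {
    	assert(rotl64(0x8000000000000001ULL, 1) == 0x0000000000000003ULL);
    	assert(rotl64(0x0123456789abcdefULL, 32) == 0x89abcdef01234567ULL);
    	assert(rotl64(rotl64(0xdeadbeefULL, 31), 33) == 0xdeadbeefULL);	/* 31 + 33 = 64 */
    	return 0;
    }
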
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/Makefile.am b/src/crypto/isa-l/isa-l_crypto/mh_sha256/Makefile.am
new file mode 100644
index 000000000..d6e8b61ab
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/Makefile.am
@@ -0,0 +1,88 @@
+########################################################################
+# Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_sha256 = mh_sha256/sha256_for_mh_sha256.c
+
+lsrc_mh_sha256 = mh_sha256/mh_sha256.c \
+ mh_sha256/mh_sha256_block_sse.asm \
+ mh_sha256/mh_sha256_block_avx.asm \
+ mh_sha256/mh_sha256_block_avx2.asm \
+ mh_sha256/mh_sha256_multibinary.asm \
+ mh_sha256/mh_sha256_finalize_base.c \
+ mh_sha256/mh_sha256_update_base.c \
+ mh_sha256/mh_sha256_block_base.c
+
+lsrc_mh_sha256 += mh_sha256/mh_sha256_block_avx512.asm \
+ mh_sha256/mh_sha256_avx512.c
+
+lsrc_x86_64 += $(lsrc_sha256) \
+ $(lsrc_mh_sha256)
+
+lsrc_x86_32 += $(lsrc_x86_64)
+
+other_src += mh_sha256/mh_sha256_ref.c \
+ include/reg_sizes.asm \
+ include/multibinary.asm \
+ include/test.h \
+ mh_sha256/mh_sha256_internal.h
+
+lsrc_aarch64 += $(lsrc_sha256) \
+ mh_sha256/aarch64/mh_sha256_multibinary.S \
+ mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c \
+ mh_sha256/aarch64/mh_sha256_block_ce.S \
+ mh_sha256/aarch64/mh_sha256_ce.c \
+ mh_sha256/mh_sha256.c \
+ mh_sha256/mh_sha256_finalize_base.c \
+ mh_sha256/mh_sha256_update_base.c \
+ mh_sha256/mh_sha256_block_base.c
+
+lsrc_base_aliases += $(lsrc_sha256) \
+ mh_sha256/mh_sha256_base_aliases.c \
+ mh_sha256/mh_sha256.c \
+ mh_sha256/mh_sha256_finalize_base.c \
+ mh_sha256/mh_sha256_update_base.c \
+ mh_sha256/mh_sha256_block_base.c
+
+src_include += -I $(srcdir)/mh_sha256
+
+extern_hdrs += include/mh_sha256.h
+
+check_tests += mh_sha256/mh_sha256_test
+unit_tests += mh_sha256/mh_sha256_update_test
+
+perf_tests += mh_sha256/mh_sha256_perf
+
+
+mh_sha256_test: mh_sha256_ref.o
+mh_sha256_mh_sha256_test_LDADD = mh_sha256/mh_sha256_ref.lo libisal_crypto.la
+
+mh_sha256_update_test: mh_sha256_ref.o
+mh_sha256_mh_sha256_update_test_LDADD = mh_sha256/mh_sha256_ref.lo libisal_crypto.la
+
+mh_sha256_mh_sha256_perf_LDADD = libisal_crypto.la
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c
new file mode 100644
index 000000000..155790fc1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c
@@ -0,0 +1,49 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+DEFINE_INTERFACE_DISPATCHER(mh_sha256_update)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA2)
+ return PROVIDER_INFO(mh_sha256_update_ce);
+
+ return PROVIDER_BASIC(mh_sha256_update);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(mh_sha256_finalize)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA2)
+ return PROVIDER_INFO(mh_sha256_finalize_ce);
+
+ return PROVIDER_BASIC(mh_sha256_finalize);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_block_ce.S b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_block_ce.S
new file mode 100644
index 000000000..53a78ea7d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_block_ce.S
@@ -0,0 +1,731 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 6
+
+ .global mh_sha256_block_ce
+ .type mh_sha256_block_ce, %function
+
+/*
+Macros
+*/
+
+.macro declare_vector_reg name:req,reg:req,default:req
+ \name .req \default\reg
+ q_\name .req q\reg
+ v_\name .req v\reg
+ s_\name .req s\reg
+.endm
+
+declare_vector_reg lane0_msg0, 0,v
+declare_vector_reg lane1_msg0, 1,v
+declare_vector_reg lane2_msg0, 2,v
+declare_vector_reg lane3_msg0, 3,v
+
+declare_vector_reg lane0_msg1, 4,v
+declare_vector_reg lane1_msg1, 5,v
+declare_vector_reg lane2_msg1, 6,v
+declare_vector_reg lane3_msg1, 7,v
+
+declare_vector_reg lane0_msg2, 8,v
+declare_vector_reg lane1_msg2, 9,v
+declare_vector_reg lane2_msg2, 10,v
+declare_vector_reg lane3_msg2, 11,v
+
+declare_vector_reg lane0_msg3, 12,v
+declare_vector_reg lane1_msg3, 13,v
+declare_vector_reg lane2_msg3, 14,v
+declare_vector_reg lane3_msg3, 15,v
+
+declare_vector_reg lane0_state0, 16,v
+declare_vector_reg lane1_state0, 17,v
+declare_vector_reg lane2_state0, 18,v
+declare_vector_reg lane3_state0, 19,v
+
+declare_vector_reg lane0_state1, 20,v
+declare_vector_reg lane1_state1, 21,v
+declare_vector_reg lane2_state1, 22,v
+declare_vector_reg lane3_state1, 23,v
+
+declare_vector_reg lane0_tmp0, 24,v
+declare_vector_reg lane1_tmp0, 25,v
+declare_vector_reg lane2_tmp0, 26,v
+declare_vector_reg lane3_tmp0, 27,v
+
+declare_vector_reg lane0_tmp2, 28,v
+declare_vector_reg lane1_tmp2, 29,v
+declare_vector_reg lane2_tmp2, 30,v
+declare_vector_reg lane3_tmp2, 31,v
+
+declare_vector_reg key, 27,v
+declare_vector_reg tmp, 29,v
+
+/*
+void mh_sha256_block_ce(const uint8_t * input_data,
+ uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE],
+ uint32_t num_blocks);
+*/
+ x_input_data .req x0
+ x_digests .req x1
+ x_frame_buffer .req x2
+ w_num_blocks .req w3
+
+ x_digest_addr .req x4
+ x_key_addr .req x5
+ x_msg_addr .req x6
+ x_lane_offs .req x7
+ x_offs .req x9
+ w_input_data_end .req w10
+ x_input_data_end .req x10
+ x_tmp .req x11
+mh_sha256_block_ce:
+ cbz w_num_blocks, .exit
+ mov w_input_data_end, w_num_blocks
+
+ ubfiz x_input_data_end, x_input_data_end, 10, 32
+ add x_input_data_end, x_input_data, x_input_data_end
+
+ adrp x_key_addr, .key_addr
+ add x_key_addr, x_key_addr, :lo12:.key_addr
+
+ stp d8, d9, [sp, -192]!
+
+ stp d10, d11, [sp, 16]
+ stp d12, d13, [sp, 32]
+ stp d14, d15, [sp, 48]
+
+ .p2align 3,,7
+.start_loop:
+ mov x_lane_offs, 0
+ mov x_digest_addr, x_digests
+
+.lane_loop:
+ add x_msg_addr, x_input_data, x_lane_offs, lsl 2
+
+ .p2align 3,,7
+ mov x_offs, 64
+ mov x_tmp, x_digest_addr
+ ld4 {v_lane0_state0.S-v_lane3_state0.S}[0], [x_tmp], x_offs
+ ld4 {v_lane0_state0.S-v_lane3_state0.S}[1], [x_tmp], x_offs
+ ld4 {v_lane0_state0.S-v_lane3_state0.S}[2], [x_tmp], x_offs
+ ld4 {v_lane0_state0.S-v_lane3_state0.S}[3], [x_tmp], x_offs
+
+ add x_tmp, x_digest_addr, 256
+ ld4 {v_lane0_state1.S-v_lane3_state1.S}[0], [x_tmp], x_offs
+ ld4 {v_lane0_state1.S-v_lane3_state1.S}[1], [x_tmp], x_offs
+ ld4 {v_lane0_state1.S-v_lane3_state1.S}[2], [x_tmp], x_offs
+ ld4 {v_lane0_state1.S-v_lane3_state1.S}[3], [x_tmp], x_offs
+
+ ld4 {v_lane0_msg0.S-v_lane3_msg0.S}[0], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg0.S-v_lane3_msg0.S}[1], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg0.S-v_lane3_msg0.S}[2], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg0.S-v_lane3_msg0.S}[3], [x_msg_addr], x_offs
+
+ ld4 {v_lane0_msg1.S-v_lane3_msg1.S}[0], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg1.S-v_lane3_msg1.S}[1], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg1.S-v_lane3_msg1.S}[2], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg1.S-v_lane3_msg1.S}[3], [x_msg_addr], x_offs
+
+ ld4 {v_lane0_msg2.S-v_lane3_msg2.S}[0], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg2.S-v_lane3_msg2.S}[1], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg2.S-v_lane3_msg2.S}[2], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg2.S-v_lane3_msg2.S}[3], [x_msg_addr], x_offs
+
+ ld4 {v_lane0_msg3.S-v_lane3_msg3.S}[0], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg3.S-v_lane3_msg3.S}[1], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg3.S-v_lane3_msg3.S}[2], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg3.S-v_lane3_msg3.S}[3], [x_msg_addr], x_offs
+
+	// byte-swap each 32-bit word: SHA-256 treats message words as big-endian
+ rev32 v_lane0_msg0.16b, v_lane0_msg0.16b
+ rev32 v_lane1_msg0.16b, v_lane1_msg0.16b
+ rev32 v_lane2_msg0.16b, v_lane2_msg0.16b
+ rev32 v_lane3_msg0.16b, v_lane3_msg0.16b
+
+ rev32 v_lane0_msg1.16b, v_lane0_msg1.16b
+ rev32 v_lane1_msg1.16b, v_lane1_msg1.16b
+ rev32 v_lane2_msg1.16b, v_lane2_msg1.16b
+ rev32 v_lane3_msg1.16b, v_lane3_msg1.16b
+
+ rev32 v_lane0_msg2.16b, v_lane0_msg2.16b
+ rev32 v_lane1_msg2.16b, v_lane1_msg2.16b
+ rev32 v_lane2_msg2.16b, v_lane2_msg2.16b
+ rev32 v_lane3_msg2.16b, v_lane3_msg2.16b
+
+ rev32 v_lane0_msg3.16b, v_lane0_msg3.16b
+ rev32 v_lane1_msg3.16b, v_lane1_msg3.16b
+ rev32 v_lane2_msg3.16b, v_lane2_msg3.16b
+ rev32 v_lane3_msg3.16b, v_lane3_msg3.16b
+
+ // rounds 0-3
+ ldr q_key, [x_key_addr]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg0.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg0.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg0.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg0.4s
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ str q_lane0_state1, [sp, 64]
+ str q_lane1_state1, [sp, 80]
+ str q_lane2_state1, [sp, 96]
+ str q_lane3_state1, [sp, 112]
+
+ mov x_offs, 64
+ mov x_tmp, x_digest_addr
+ ld4 {v_lane0_tmp2.S-v_lane3_tmp2.S}[0], [x_tmp], x_offs
+ ld4 {v_lane0_tmp2.S-v_lane3_tmp2.S}[1], [x_tmp], x_offs
+ ld4 {v_lane0_tmp2.S-v_lane3_tmp2.S}[2], [x_tmp], x_offs
+ ld4 {v_lane0_tmp2.S-v_lane3_tmp2.S}[3], [x_tmp], x_offs
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg0.4s, v_lane0_msg1.4s
+ sha256su0 v_lane1_msg0.4s, v_lane1_msg1.4s
+ sha256su0 v_lane2_msg0.4s, v_lane2_msg1.4s
+ sha256su0 v_lane3_msg0.4s, v_lane3_msg1.4s
+
+ sha256su1 v_lane0_msg0.4s, v_lane0_msg2.4s, v_lane0_msg3.4s
+ sha256su1 v_lane1_msg0.4s, v_lane1_msg2.4s, v_lane1_msg3.4s
+ sha256su1 v_lane2_msg0.4s, v_lane2_msg2.4s, v_lane2_msg3.4s
+ sha256su1 v_lane3_msg0.4s, v_lane3_msg2.4s, v_lane3_msg3.4s
+
+ // rounds 4-7
+ ldr q_key, [x_key_addr, 16]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg1.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg1.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg1.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg1.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg1.4s, v_lane0_msg2.4s
+ sha256su0 v_lane1_msg1.4s, v_lane1_msg2.4s
+ sha256su0 v_lane2_msg1.4s, v_lane2_msg2.4s
+ sha256su0 v_lane3_msg1.4s, v_lane3_msg2.4s
+
+ sha256su1 v_lane0_msg1.4s, v_lane0_msg3.4s, v_lane0_msg0.4s
+ sha256su1 v_lane1_msg1.4s, v_lane1_msg3.4s, v_lane1_msg0.4s
+ sha256su1 v_lane2_msg1.4s, v_lane2_msg3.4s, v_lane2_msg0.4s
+ sha256su1 v_lane3_msg1.4s, v_lane3_msg3.4s, v_lane3_msg0.4s
+
+ // rounds 8-11
+ ldr q_key, [x_key_addr, 32]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg2.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg2.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg2.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg2.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg2.4s, v_lane0_msg3.4s
+ sha256su0 v_lane1_msg2.4s, v_lane1_msg3.4s
+ sha256su0 v_lane2_msg2.4s, v_lane2_msg3.4s
+ sha256su0 v_lane3_msg2.4s, v_lane3_msg3.4s
+
+ sha256su1 v_lane0_msg2.4s, v_lane0_msg0.4s, v_lane0_msg1.4s
+ sha256su1 v_lane1_msg2.4s, v_lane1_msg0.4s, v_lane1_msg1.4s
+ sha256su1 v_lane2_msg2.4s, v_lane2_msg0.4s, v_lane2_msg1.4s
+ sha256su1 v_lane3_msg2.4s, v_lane3_msg0.4s, v_lane3_msg1.4s
+
+ // rounds 12-15
+ ldr q_key, [x_key_addr, 48]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg3.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg3.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg3.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg3.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg3.4s, v_lane0_msg0.4s
+ sha256su0 v_lane1_msg3.4s, v_lane1_msg0.4s
+ sha256su0 v_lane2_msg3.4s, v_lane2_msg0.4s
+ sha256su0 v_lane3_msg3.4s, v_lane3_msg0.4s
+
+ sha256su1 v_lane0_msg3.4s, v_lane0_msg1.4s, v_lane0_msg2.4s
+ sha256su1 v_lane1_msg3.4s, v_lane1_msg1.4s, v_lane1_msg2.4s
+ sha256su1 v_lane2_msg3.4s, v_lane2_msg1.4s, v_lane2_msg2.4s
+ sha256su1 v_lane3_msg3.4s, v_lane3_msg1.4s, v_lane3_msg2.4s
+
+ // rounds 16-19
+ ldr q_key, [x_key_addr, 64]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg0.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg0.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg0.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg0.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg0.4s, v_lane0_msg1.4s
+ sha256su0 v_lane1_msg0.4s, v_lane1_msg1.4s
+ sha256su0 v_lane2_msg0.4s, v_lane2_msg1.4s
+ sha256su0 v_lane3_msg0.4s, v_lane3_msg1.4s
+
+ sha256su1 v_lane0_msg0.4s, v_lane0_msg2.4s, v_lane0_msg3.4s
+ sha256su1 v_lane1_msg0.4s, v_lane1_msg2.4s, v_lane1_msg3.4s
+ sha256su1 v_lane2_msg0.4s, v_lane2_msg2.4s, v_lane2_msg3.4s
+ sha256su1 v_lane3_msg0.4s, v_lane3_msg2.4s, v_lane3_msg3.4s
+
+ // rounds 20-23
+ ldr q_key, [x_key_addr, 80]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg1.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg1.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg1.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg1.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg1.4s, v_lane0_msg2.4s
+ sha256su0 v_lane1_msg1.4s, v_lane1_msg2.4s
+ sha256su0 v_lane2_msg1.4s, v_lane2_msg2.4s
+ sha256su0 v_lane3_msg1.4s, v_lane3_msg2.4s
+
+ sha256su1 v_lane0_msg1.4s, v_lane0_msg3.4s, v_lane0_msg0.4s
+ sha256su1 v_lane1_msg1.4s, v_lane1_msg3.4s, v_lane1_msg0.4s
+ sha256su1 v_lane2_msg1.4s, v_lane2_msg3.4s, v_lane2_msg0.4s
+ sha256su1 v_lane3_msg1.4s, v_lane3_msg3.4s, v_lane3_msg0.4s
+
+ // rounds 24-27
+ ldr q_key, [x_key_addr, 96]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg2.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg2.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg2.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg2.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg2.4s, v_lane0_msg3.4s
+ sha256su0 v_lane1_msg2.4s, v_lane1_msg3.4s
+ sha256su0 v_lane2_msg2.4s, v_lane2_msg3.4s
+ sha256su0 v_lane3_msg2.4s, v_lane3_msg3.4s
+
+ sha256su1 v_lane0_msg2.4s, v_lane0_msg0.4s, v_lane0_msg1.4s
+ sha256su1 v_lane1_msg2.4s, v_lane1_msg0.4s, v_lane1_msg1.4s
+ sha256su1 v_lane2_msg2.4s, v_lane2_msg0.4s, v_lane2_msg1.4s
+ sha256su1 v_lane3_msg2.4s, v_lane3_msg0.4s, v_lane3_msg1.4s
+
+ // rounds 28-31
+ ldr q_key, [x_key_addr, 112]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg3.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg3.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg3.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg3.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg3.4s, v_lane0_msg0.4s
+ sha256su0 v_lane1_msg3.4s, v_lane1_msg0.4s
+ sha256su0 v_lane2_msg3.4s, v_lane2_msg0.4s
+ sha256su0 v_lane3_msg3.4s, v_lane3_msg0.4s
+
+ sha256su1 v_lane0_msg3.4s, v_lane0_msg1.4s, v_lane0_msg2.4s
+ sha256su1 v_lane1_msg3.4s, v_lane1_msg1.4s, v_lane1_msg2.4s
+ sha256su1 v_lane2_msg3.4s, v_lane2_msg1.4s, v_lane2_msg2.4s
+ sha256su1 v_lane3_msg3.4s, v_lane3_msg1.4s, v_lane3_msg2.4s
+
+ // rounds 32-35
+ ldr q_key, [x_key_addr, 128]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg0.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg0.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg0.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg0.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg0.4s, v_lane0_msg1.4s
+ sha256su0 v_lane1_msg0.4s, v_lane1_msg1.4s
+ sha256su0 v_lane2_msg0.4s, v_lane2_msg1.4s
+ sha256su0 v_lane3_msg0.4s, v_lane3_msg1.4s
+
+ sha256su1 v_lane0_msg0.4s, v_lane0_msg2.4s, v_lane0_msg3.4s
+ sha256su1 v_lane1_msg0.4s, v_lane1_msg2.4s, v_lane1_msg3.4s
+ sha256su1 v_lane2_msg0.4s, v_lane2_msg2.4s, v_lane2_msg3.4s
+ sha256su1 v_lane3_msg0.4s, v_lane3_msg2.4s, v_lane3_msg3.4s
+
+ // rounds 36-39
+ ldr q_key, [x_key_addr, 144]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg1.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg1.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg1.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg1.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg1.4s, v_lane0_msg2.4s
+ sha256su0 v_lane1_msg1.4s, v_lane1_msg2.4s
+ sha256su0 v_lane2_msg1.4s, v_lane2_msg2.4s
+ sha256su0 v_lane3_msg1.4s, v_lane3_msg2.4s
+
+ sha256su1 v_lane0_msg1.4s, v_lane0_msg3.4s, v_lane0_msg0.4s
+ sha256su1 v_lane1_msg1.4s, v_lane1_msg3.4s, v_lane1_msg0.4s
+ sha256su1 v_lane2_msg1.4s, v_lane2_msg3.4s, v_lane2_msg0.4s
+ sha256su1 v_lane3_msg1.4s, v_lane3_msg3.4s, v_lane3_msg0.4s
+
+ // rounds 40-43
+ ldr q_key, [x_key_addr, 160]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg2.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg2.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg2.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg2.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg2.4s, v_lane0_msg3.4s
+ sha256su0 v_lane1_msg2.4s, v_lane1_msg3.4s
+ sha256su0 v_lane2_msg2.4s, v_lane2_msg3.4s
+ sha256su0 v_lane3_msg2.4s, v_lane3_msg3.4s
+
+ sha256su1 v_lane0_msg2.4s, v_lane0_msg0.4s, v_lane0_msg1.4s
+ sha256su1 v_lane1_msg2.4s, v_lane1_msg0.4s, v_lane1_msg1.4s
+ sha256su1 v_lane2_msg2.4s, v_lane2_msg0.4s, v_lane2_msg1.4s
+ sha256su1 v_lane3_msg2.4s, v_lane3_msg0.4s, v_lane3_msg1.4s
+
+ // rounds 44-47
+ ldr q_key, [x_key_addr, 176]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg3.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg3.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg3.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg3.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg3.4s, v_lane0_msg0.4s
+ sha256su0 v_lane1_msg3.4s, v_lane1_msg0.4s
+ sha256su0 v_lane2_msg3.4s, v_lane2_msg0.4s
+ sha256su0 v_lane3_msg3.4s, v_lane3_msg0.4s
+
+ sha256su1 v_lane0_msg3.4s, v_lane0_msg1.4s, v_lane0_msg2.4s
+ sha256su1 v_lane1_msg3.4s, v_lane1_msg1.4s, v_lane1_msg2.4s
+ sha256su1 v_lane2_msg3.4s, v_lane2_msg1.4s, v_lane2_msg2.4s
+ sha256su1 v_lane3_msg3.4s, v_lane3_msg1.4s, v_lane3_msg2.4s
+
+ // rounds 48-51
+ ldr q_key, [x_key_addr, 192]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg0.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg0.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg0.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg0.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ // rounds 52-55
+ ldr q_key, [x_key_addr, 208]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg1.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg1.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg1.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg1.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ // rounds 56-59
+ ldr q_key, [x_key_addr, 224]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg2.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg2.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg2.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg2.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ // rounds 60-63
+ ldr q_key, [x_key_addr, 240]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg3.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg3.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg3.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg3.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ mov x_offs, 64
+ mov x_tmp, x_digest_addr
+ ld4 {v_lane0_tmp0.S-v_lane3_tmp0.S}[0], [x_tmp], x_offs
+ ld4 {v_lane0_tmp0.S-v_lane3_tmp0.S}[1], [x_tmp], x_offs
+ ld4 {v_lane0_tmp0.S-v_lane3_tmp0.S}[2], [x_tmp], x_offs
+ ld4 {v_lane0_tmp0.S-v_lane3_tmp0.S}[3], [x_tmp], x_offs
+
+ add v_lane0_state0.4s, v_lane0_tmp0.4s, v_lane0_state0.4s
+ add v_lane1_state0.4s, v_lane1_tmp0.4s, v_lane1_state0.4s
+ add v_lane2_state0.4s, v_lane2_tmp0.4s, v_lane2_state0.4s
+ add v_lane3_state0.4s, v_lane3_tmp0.4s, v_lane3_state0.4s
+
+ mov x_offs, 64
+ mov x_tmp, x_digest_addr
+ st4 {v_lane0_state0.S-v_lane3_state0.S}[0], [x_tmp], x_offs
+ st4 {v_lane0_state0.S-v_lane3_state0.S}[1], [x_tmp], x_offs
+ st4 {v_lane0_state0.S-v_lane3_state0.S}[2], [x_tmp], x_offs
+ st4 {v_lane0_state0.S-v_lane3_state0.S}[3], [x_tmp], x_offs
+
+ ldp q_lane0_tmp2, q_lane1_tmp2, [sp, 64]
+ ldp q_lane2_tmp2, q_lane3_tmp2, [sp, 96]
+
+ add v_lane0_state1.4s, v_lane0_tmp2.4s, v_lane0_state1.4s
+ add v_lane1_state1.4s, v_lane1_tmp2.4s, v_lane1_state1.4s
+ add v_lane2_state1.4s, v_lane2_tmp2.4s, v_lane2_state1.4s
+ add v_lane3_state1.4s, v_lane3_tmp2.4s, v_lane3_state1.4s
+
+ mov x_offs, 64
+ add x_tmp, x_digest_addr, 256
+ st4 {v_lane0_state1.S-v_lane3_state1.S}[0], [x_tmp], x_offs
+ st4 {v_lane0_state1.S-v_lane3_state1.S}[1], [x_tmp], x_offs
+ st4 {v_lane0_state1.S-v_lane3_state1.S}[2], [x_tmp], x_offs
+ st4 {v_lane0_state1.S-v_lane3_state1.S}[3], [x_tmp], x_offs
+
+ add x_digest_addr, x_digest_addr, 16
+ add x_lane_offs, x_lane_offs, 4
+ cmp x_lane_offs, 16
+ bne .lane_loop
+
+ add x_input_data, x_input_data, 1024
+ cmp x_input_data, x_input_data_end
+ bne .start_loop
+
+ ldp d10, d11, [sp, 16]
+ ldp d12, d13, [sp, 32]
+ ldp d14, d15, [sp, 48]
+ ldp d8, d9, [sp], 192
+.exit:
+ ret
+ .size mh_sha256_block_ce, .-mh_sha256_block_ce
+
+ .section .rodata
+ .align 4
+ .set .key_addr,. + 0
+ .type K, %object
+ .size K, 256
+K:
+ .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_ce.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_ce.c
new file mode 100644
index 000000000..c42333ed5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_ce.c
@@ -0,0 +1,53 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <string.h>
+#include "mh_sha256_internal.h"
+
+void mh_sha256_block_ce(const uint8_t * input_data,
+ uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+/***************mh_sha256_update***********/
+// mh_sha256_update_ce.c
+#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_ce
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_ce
+#include "mh_sha256_update_base.c"
+#undef MH_SHA256_UPDATE_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+/***************mh_sha256_finalize AND mh_sha256_tail***********/
+// mh_sha256_tail processes the last (incomplete) block of source data
+// mh_sha256_finalize is an mh_sha256_ctx wrapper around mh_sha256_tail
+// mh_sha256_finalize_ce.c and mh_sha256_tail_ce.c
+#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_ce
+#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_ce
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_ce
+#include "mh_sha256_finalize_base.c"
+#undef MH_SHA256_FINALIZE_FUNCTION
+#undef MH_SHA256_TAIL_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
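The #define/#include/#undef blocks above stamp architecture-specific update and finalize entry points out of the shared base sources, the same way the _sse/_avx/_avx2 variants are produced later in this patch. The sketch below only illustrates that macro-template pattern; demo_ctx, demo_template.h and the demo_* names are hypothetical stand-ins, not the actual contents of mh_sha256_update_base.c.

    /* demo_template.h - hypothetical template: the includer defines
     * UPDATE_FUNCTION and BLOCK_FUNCTION before including this file. */
    int UPDATE_FUNCTION(struct demo_ctx *ctx, const uint8_t *buf, uint32_t nblocks)
    {
            BLOCK_FUNCTION(buf, ctx->digest, nblocks);  /* arch-specific block kernel */
            ctx->blocks_done += nblocks;
            return 0;
    }

    /* demo.c - instantiates the template once per kernel */
    #include <stdint.h>

    struct demo_ctx { uint32_t digest[8][16]; uint32_t blocks_done; };

    static void demo_block_c(const uint8_t *in, uint32_t d[8][16], uint32_t n)    { (void)in; (void)d; (void)n; }
    static void demo_block_neon(const uint8_t *in, uint32_t d[8][16], uint32_t n) { (void)in; (void)d; (void)n; }

    #define UPDATE_FUNCTION demo_update_c
    #define BLOCK_FUNCTION  demo_block_c
    #include "demo_template.h"
    #undef UPDATE_FUNCTION
    #undef BLOCK_FUNCTION

    #define UPDATE_FUNCTION demo_update_neon
    #define BLOCK_FUNCTION  demo_block_neon
    #include "demo_template.h"
    #undef UPDATE_FUNCTION
    #undef BLOCK_FUNCTION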
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_multibinary.S b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_multibinary.S
new file mode 100644
index 000000000..54eece175
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_multibinary.S
@@ -0,0 +1,35 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#include "aarch64_multibinary.h"
+
+
+mbin_interface mh_sha256_update
+mbin_interface mh_sha256_finalize
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256.c
new file mode 100644
index 000000000..242c3e218
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256.c
@@ -0,0 +1,143 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha256_internal.h"
+
+int mh_sha256_init(struct mh_sha256_ctx *ctx)
+{
+ uint32_t(*mh_sha256_segs_digests)[HASH_SEGS];
+ uint32_t i;
+
+ if (ctx == NULL)
+ return MH_SHA256_CTX_ERROR_NULL;
+
+ memset(ctx, 0, sizeof(*ctx));
+
+ mh_sha256_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha256_interim_digests;
+ for (i = 0; i < HASH_SEGS; i++) {
+ mh_sha256_segs_digests[0][i] = MH_SHA256_H0;
+ mh_sha256_segs_digests[1][i] = MH_SHA256_H1;
+ mh_sha256_segs_digests[2][i] = MH_SHA256_H2;
+ mh_sha256_segs_digests[3][i] = MH_SHA256_H3;
+ mh_sha256_segs_digests[4][i] = MH_SHA256_H4;
+ mh_sha256_segs_digests[5][i] = MH_SHA256_H5;
+ mh_sha256_segs_digests[6][i] = MH_SHA256_H6;
+ mh_sha256_segs_digests[7][i] = MH_SHA256_H7;
+ }
+
+ return MH_SHA256_CTX_ERROR_NONE;
+}
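As context for the ctx API initialized here, a minimal caller might look like the sketch below. Only the three function signatures are taken from this patch; the header name mh_sha256.h, the availability of SHA256_DIGEST_WORDS and the MH_SHA256_CTX_ERROR_* codes to callers, and the convention that all three calls return MH_SHA256_CTX_ERROR_NONE on success are assumptions.

    /* sketch only: header name and exported macros are assumptions */
    #include <stdint.h>
    #include "mh_sha256.h"

    int demo_mh_sha256(const void *data, uint32_t len,
                       uint32_t digest[SHA256_DIGEST_WORDS])
    {
            struct mh_sha256_ctx ctx;

            if (mh_sha256_init(&ctx) != MH_SHA256_CTX_ERROR_NONE)
                    return -1;
            if (mh_sha256_update(&ctx, data, len) != MH_SHA256_CTX_ERROR_NONE)
                    return -1;
            if (mh_sha256_finalize(&ctx, digest) != MH_SHA256_CTX_ERROR_NONE)
                    return -1;
            return 0;  /* digest holds the 8-word (32-byte) multi-hash result */
    }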
+
+#if (!defined(NOARCH)) && (defined(__i386__) || defined(__x86_64__) \
+ || defined( _M_X64) || defined(_M_IX86))
+/***************mh_sha256_update***********/
+// mh_sha256_update_sse.c
+#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_sse
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_sse
+#include "mh_sha256_update_base.c"
+#undef MH_SHA256_UPDATE_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+// mh_sha256_update_avx.c
+#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_avx
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx
+#include "mh_sha256_update_base.c"
+#undef MH_SHA256_UPDATE_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+// mh_sha256_update_avx2.c
+#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_avx2
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx2
+#include "mh_sha256_update_base.c"
+#undef MH_SHA256_UPDATE_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+/***************mh_sha256_finalize AND mh_sha256_tail***********/
+// mh_sha256_tail processes the last (incomplete) block of source data
+// mh_sha256_finalize is an mh_sha256_ctx wrapper around mh_sha256_tail
+
+// mh_sha256_finalize_sse.c and mh_sha256_tail_sse.c
+#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_sse
+#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_sse
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_sse
+#include "mh_sha256_finalize_base.c"
+#undef MH_SHA256_FINALIZE_FUNCTION
+#undef MH_SHA256_TAIL_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+// mh_sha256_finalize_avx.c and mh_sha256_tail_avx.c
+#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_avx
+#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_avx
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx
+#include "mh_sha256_finalize_base.c"
+#undef MH_SHA256_FINALIZE_FUNCTION
+#undef MH_SHA256_TAIL_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+// mh_sha256_finalize_avx2.c and mh_sha256_tail_avx2.c
+#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_avx2
+#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_avx2
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx2
+#include "mh_sha256_finalize_base.c"
+#undef MH_SHA256_FINALIZE_FUNCTION
+#undef MH_SHA256_TAIL_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+/***************version info***********/
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+// mh_sha256_init version info
+struct slver mh_sha256_init_slver_000002b1;
+struct slver mh_sha256_init_slver = { 0x02b1, 0x00, 0x00 };
+
+// mh_sha256_update version info
+struct slver mh_sha256_update_sse_slver_000002b4;
+struct slver mh_sha256_update_sse_slver = { 0x02b4, 0x00, 0x00 };
+
+struct slver mh_sha256_update_avx_slver_020002b6;
+struct slver mh_sha256_update_avx_slver = { 0x02b6, 0x00, 0x02 };
+
+struct slver mh_sha256_update_avx2_slver_040002b8;
+struct slver mh_sha256_update_avx2_slver = { 0x02b8, 0x00, 0x04 };
+
+// mh_sha256_finalize version info
+struct slver mh_sha256_finalize_sse_slver_000002b5;
+struct slver mh_sha256_finalize_sse_slver = { 0x02b5, 0x00, 0x00 };
+
+struct slver mh_sha256_finalize_avx_slver_020002b7;
+struct slver mh_sha256_finalize_avx_slver = { 0x02b7, 0x00, 0x02 };
+
+struct slver mh_sha256_finalize_avx2_slver_040002b9;
+struct slver mh_sha256_finalize_avx2_slver = { 0x02b9, 0x00, 0x04 };
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_avx512.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_avx512.c
new file mode 100644
index 000000000..35fb0fbad
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_avx512.c
@@ -0,0 +1,70 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha256_internal.h"
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+/***************mh_sha256_update***********/
+// mh_sha256_update_avx512.c
+#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_avx512
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx512
+#include "mh_sha256_update_base.c"
+#undef MH_SHA256_UPDATE_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+/***************mh_sha256_finalize AND mh_sha256_tail***********/
+// mh_sha256_tail processes the last (incomplete) block of source data
+// mh_sha256_finalize is an mh_sha256_ctx wrapper around mh_sha256_tail
+// mh_sha256_finalize_avx512.c and mh_sha256_tail_avx512.c
+#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_avx512
+#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_avx512
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx512
+#include "mh_sha256_finalize_base.c"
+#undef MH_SHA256_FINALIZE_FUNCTION
+#undef MH_SHA256_TAIL_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+/***************version info***********/
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// mh_sha256_update version info
+struct slver mh_sha256_update_avx512_slver_060002bc;
+struct slver mh_sha256_update_avx512_slver = { 0x02bc, 0x00, 0x06 };
+
+// mh_sha256_finalize version info
+struct slver mh_sha256_finalize_avx512_slver_060002bd;
+struct slver mh_sha256_finalize_avx512_slver = { 0x02bd, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_base_aliases.c
new file mode 100644
index 000000000..343ffb024
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_base_aliases.c
@@ -0,0 +1,40 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "mh_sha256_internal.h"
+#include <string.h>
+int mh_sha256_update(struct mh_sha256_ctx *ctx, const void *buffer, uint32_t len)
+{
+ return mh_sha256_update_base(ctx, buffer, len);
+}
+
+int mh_sha256_finalize(struct mh_sha256_ctx *ctx, void *mh_sha256_digest)
+{
+ return mh_sha256_finalize_base(ctx, mh_sha256_digest);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx.asm
new file mode 100644
index 000000000..c2eff350d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx.asm
@@ -0,0 +1,557 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA256 digests in parallel using AVX
+;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha256
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables used for storing segs_digests on the stack
+%define RSP_SAVE tmp2
+%define FRAMESZ 4*8*16 ;BYTES*DWORDS*SEGS
+
+; Common definitions
+%define ROUND tmp4
+%define TBL tmp5
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovups
+
+%define SZ 4
+%define SZ4 4*SZ
+%define ROUNDS 64*SZ4
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpslld %%tmp, %%reg, (32-(%%imm))
+ vpsrld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PRORD_nd reg, imm, tmp, src
+%macro PRORD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpslld %%tmp, %%src, (32-(%%imm))
+ vpsrld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORD dst/src, amt
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+; PRORD_nd dst, src, amt
+%macro PRORD_nd 3
+ PRORD_nd %1, %3, TMP, %2
+%endmacro
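+; Both PRORD variants implement a 32-bit rotate right, ror(x, n) = (x >> n) | (x << (32 - n)),
+; built from two shifts and an OR since AVX has no packed-dword rotate instruction.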
+
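+;; The round macros below follow the standard SHA-256 round functions:
+;;   ch(e,f,g)  = ((f ^ g) & e) ^ g
+;;   maj(a,b,c) = ((a ^ c) & b) | (a & c)
+;;   sigma1(e)  = ror(e,6)  ^ ror(e,11) ^ ror(e,25)
+;;   sigma0(a)  = ror(a,2)  ^ ror(a,13) ^ ror(a,22)
+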
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15_R 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, g ; a2 = ch
+
+ PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
+ vmovdqa %%T1, [SZ4*(%%i&0xf) + %%data]
+ vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd h, h, a2 ; h = h + ch
+ PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
+ vpaddd h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
+ vpxor %%T1, a, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddd h, h, a0
+
+ vpaddd d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddd h, h, a1 ; h = h + ch + W + K + maj
+ vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15_W 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, g ; a2 = ch
+
+ PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
+ vmovdqa [SZ4*(%%i&0xf) + %%data], %%T1
+ vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd h, h, a2 ; h = h + ch
+ PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
+ vpaddd h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
+ vpxor %%T1, a, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddd h, h, a0
+
+ vpaddd d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddd h, h, a1 ; h = h + ch + W + K + maj
+ vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + %%data]
+ vmovdqa a1, [SZ4*((%%i-2)&0xf) + %%data]
+ vmovdqa a0, %%T1
+ PRORD %%T1, 18-7
+ vmovdqa a2, a1
+ PRORD a1, 19-17
+ vpxor %%T1, %%T1, a0
+ PRORD %%T1, 7
+ vpxor a1, a1, a2
+ PRORD a1, 17
+ vpsrld a0, a0, 3
+ vpxor %%T1, %%T1, a0
+ vpsrld a2, a2, 10
+ vpxor a1, a1, a2
+ vpaddd %%T1, %%T1, [SZ4*((%%i-16)&0xf) + %%data]
+ vpaddd a1, a1, [SZ4*((%%i-7)&0xf) + %%data]
+ vpaddd %%T1, %%T1, a1
+
+ ROUND_00_15_W %%T1, %%i, %%data
+%endm
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h7 | h7 | h7 | ...| h7 | | Ha| Hb | Hc |...| Hp |
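+; (i.e. digest word j of segment i sits at byte offset j*64 + i*4, matching the
+; uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS] layout of arg 1)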
+
+align 32
+
+;void mh_sha256_block_avx(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including the segment digests (uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS])
+; arg 2 pointer to aligned_frame_buffer, used to store the big-endian copy of the input
+; arg 3 number of 1KB blocks
+;
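+; Each 1KB block carries 16 interleaved segments (16 segments x 64 bytes each);
+; the .segs_loop below hashes 4 segments per pass, so four passes finish a block.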
+mk_global mh_sha256_block_avx, function, internal
+func(mh_sha256_block_avx)
+ endbranch
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+	; align rsp to the 16-byte boundary required by AVX
+ and rsp, ~0x0F
+ lea TBL,[TABLE]
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 8
+ VMOVPS a, [mh_digests_p + I*64 + 16*0]
+ VMOVPS b, [mh_digests_p + I*64 + 16*1]
+ VMOVPS c, [mh_digests_p + I*64 + 16*2]
+ VMOVPS d, [mh_digests_p + I*64 + 16*3]
+
+ vmovdqa [rsp + I*64 + 16*0], a
+ vmovdqa [rsp + I*64 + 16*1], b
+ vmovdqa [rsp + I*64 + 16*2], c
+ vmovdqa [rsp + I*64 + 16*3], d
+ %assign I (I+1)
+ %endrep
+
+.block_loop:
+	;transform input to big-endian and store it in the aligned frame buffer
+ vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*8 to DWORD*4_SEGS*8*4
+ %assign I 0
+ %rep 16
+ VMOVPS TT0,[mh_in_p + I*64+0*16]
+ VMOVPS TT1,[mh_in_p + I*64+1*16]
+ VMOVPS TT2,[mh_in_p + I*64+2*16]
+ VMOVPS TT3,[mh_in_p + I*64+3*16]
+
+ vpshufb TT0, TMP
+ vmovdqa [mh_data_p +(I)*16 +0*256],TT0
+ vpshufb TT1, TMP
+ vmovdqa [mh_data_p +(I)*16 +1*256],TT1
+ vpshufb TT2, TMP
+ vmovdqa [mh_data_p +(I)*16 +2*256],TT2
+ vpshufb TT3, TMP
+ vmovdqa [mh_data_p +(I)*16 +3*256],TT3
+ %assign I (I+1)
+ %endrep
+
+ mov mh_segs, 0 ;start from the first 4 segments
+	mov	pref, 1024	;avoid prefetching repeatedly
+ .segs_loop:
+ xor ROUND, ROUND
+ ;; Initialize digests
+ vmovdqa a, [rsp + 0*64 + mh_segs]
+ vmovdqa b, [rsp + 1*64 + mh_segs]
+ vmovdqa c, [rsp + 2*64 + mh_segs]
+ vmovdqa d, [rsp + 3*64 + mh_segs]
+ vmovdqa e, [rsp + 4*64 + mh_segs]
+ vmovdqa f, [rsp + 5*64 + mh_segs]
+ vmovdqa g, [rsp + 6*64 + mh_segs]
+ vmovdqa h, [rsp + 7*64 + mh_segs]
+
+ %assign i 0
+ %rep 4
+ ROUND_00_15_R TT0, (i*4+0), mh_data_p
+ ROUND_00_15_R TT1, (i*4+1), mh_data_p
+ ROUND_00_15_R TT2, (i*4+2), mh_data_p
+ ROUND_00_15_R TT3, (i*4+3), mh_data_p
+ %assign i (i+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*0]
+
+ %assign i 16
+ %rep 48
+ %if i = 48
+ PREFETCH_X [mh_in_p + pref+128*1]
+ %endif
+ ROUND_16_XX T1, i, mh_data_p
+ %assign i (i+1)
+ %endrep
+
+ ;; add old digest
+ vpaddd a, a, [rsp + 0*64 + mh_segs]
+ vpaddd b, b, [rsp + 1*64 + mh_segs]
+ vpaddd c, c, [rsp + 2*64 + mh_segs]
+ vpaddd d, d, [rsp + 3*64 + mh_segs]
+ vpaddd e, e, [rsp + 4*64 + mh_segs]
+ vpaddd f, f, [rsp + 5*64 + mh_segs]
+ vpaddd g, g, [rsp + 6*64 + mh_segs]
+ vpaddd h, h, [rsp + 7*64 + mh_segs]
+
+ ; write out digests
+ vmovdqa [rsp + 0*64 + mh_segs], a
+ vmovdqa [rsp + 1*64 + mh_segs], b
+ vmovdqa [rsp + 2*64 + mh_segs], c
+ vmovdqa [rsp + 3*64 + mh_segs], d
+ vmovdqa [rsp + 4*64 + mh_segs], e
+ vmovdqa [rsp + 5*64 + mh_segs], f
+ vmovdqa [rsp + 6*64 + mh_segs], g
+ vmovdqa [rsp + 7*64 + mh_segs], h
+
+ add pref, 256
+ add mh_data_p, 256
+ add mh_segs, 16
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 8
+ vmovdqa a, [rsp + I*64 + 16*0]
+ vmovdqa b, [rsp + I*64 + 16*1]
+ vmovdqa c, [rsp + I*64 + 16*2]
+ vmovdqa d, [rsp + I*64 + 16*3]
+
+ VMOVPS [mh_digests_p + I*64 + 16*0], a
+ VMOVPS [mh_digests_p + I*64 + 16*1], b
+ VMOVPS [mh_digests_p + I*64 + 16*2], c
+ VMOVPS [mh_digests_p + I*64 + 16*3], d
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=64
+
+align 64
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx2.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx2.asm
new file mode 100644
index 000000000..c2b3f2c59
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx2.asm
@@ -0,0 +1,616 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA256 segment digests using AVX2
+;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha256
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables used for storing segs_digests on the stack
+%define RSP_SAVE tmp2
+%define FRAMESZ 4*8*16 ;BYTES*DWORDS*SEGS
+
+; Common definitions
+%define ROUND tmp4
+%define TBL tmp5
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovups
+
+%define SZ 4
+%define SZ8 8*SZ
+%define ROUNDS 64*SZ8
+
+%define a ymm0
+%define b ymm1
+%define c ymm2
+%define d ymm3
+%define e ymm4
+%define f ymm5
+%define g ymm6
+%define h ymm7
+
+%define a0 ymm8
+%define a1 ymm9
+%define a2 ymm10
+
+%define TT0 ymm14
+%define TT1 ymm13
+%define TT2 ymm12
+%define TT3 ymm11
+%define TT4 ymm10
+%define TT5 ymm9
+
+%define T1 ymm14
+%define TMP ymm15
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpslld %%tmp, %%reg, (32-(%%imm))
+ vpsrld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PRORD_nd reg, imm, tmp, src
+%macro PRORD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpslld %%tmp, %%src, (32-(%%imm))
+ vpsrld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORD dst/src, amt
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+; PRORD_nd dst, src, amt
+%macro PRORD_nd 3
+ PRORD_nd %1, %3, TMP, %2
+%endmacro
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15_R 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
+ vmovdqa %%T1, [SZ8*(%%i&0xf) + %%data]
+ vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd h, h, a2 ; h = h + ch
+ PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
+ vpaddd h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
+ vpxor %%T1, a, c ; maj: T1 = a^c
+ add ROUND, SZ8 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddd h, h, a0
+
+ vpaddd d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddd h, h, a1 ; h = h + ch + W + K + maj
+ vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15_W 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
+ vmovdqa [SZ8*(%%i&0xf) + %%data], %%T1
+ vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd h, h, a2 ; h = h + ch
+ PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
+ vpaddd h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
+ vpxor %%T1, a, c ; maj: T1 = a^c
+ add ROUND, SZ8 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddd h, h, a0
+
+ vpaddd d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddd h, h, a1 ; h = h + ch + W + K + maj
+ vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ vmovdqa %%T1, [SZ8*((%%i-15)&0xf) + %%data]
+ vmovdqa a1, [SZ8*((%%i-2)&0xf) + %%data]
+ vmovdqa a0, %%T1
+ PRORD %%T1, 18-7
+ vmovdqa a2, a1
+ PRORD a1, 19-17
+ vpxor %%T1, %%T1, a0
+ PRORD %%T1, 7
+ vpxor a1, a1, a2
+ PRORD a1, 17
+ vpsrld a0, a0, 3
+ vpxor %%T1, %%T1, a0
+ vpsrld a2, a2, 10
+ vpxor a1, a1, a2
+ vpaddd %%T1, %%T1, [SZ8*((%%i-16)&0xf) + %%data]
+ vpaddd a1, a1, [SZ8*((%%i-7)&0xf) + %%data]
+ vpaddd %%T1, %%T1, a1
+
+ ROUND_00_15_W %%T1, %%i, %%data
+%endm
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h7 | h7 | h7 | ...| h7 | | Ha| Hb | Hc |...| Hp |
+
+align 32
+
+;void mh_sha256_block_avx2(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including the segment digests (uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], 8 words x 16 segments)
+; arg 2 pointer to the aligned frame_buffer, used to hold the big-endian copy of the data
+; arg 3 number of 1KB blocks
+;
+mk_global mh_sha256_block_avx2, function, internal
+func(mh_sha256_block_avx2)
+ endbranch
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 32 Bytes needed by avx2
+ and rsp, ~0x1F
+ lea TBL,[TABLE]
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 4
+ VMOVPS a, [mh_digests_p + I*64*2 + 32*0]
+ VMOVPS b, [mh_digests_p + I*64*2 + 32*1]
+ VMOVPS c, [mh_digests_p + I*64*2 + 32*2]
+ VMOVPS d, [mh_digests_p + I*64*2 + 32*3]
+
+ vmovdqa [rsp + I*64*2 + 32*0], a
+ vmovdqa [rsp + I*64*2 + 32*1], b
+ vmovdqa [rsp + I*64*2 + 32*2], c
+ vmovdqa [rsp + I*64*2 + 32*3], d
+ %assign I (I+1)
+ %endrep
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*8 to DWORD*8_SEGS*8*2
+ %assign I 0
+ %rep 16
+ VMOVPS TT0,[mh_in_p + I*64+0*32]
+ VMOVPS TT1,[mh_in_p + I*64+1*32]
+
+ vpshufb TT0, TT0, TMP
+ vmovdqa [mh_data_p +I*32 +0*512],TT0
+ vpshufb TT1, TT1, TMP
+ vmovdqa [mh_data_p +I*32 +1*512],TT1
+ %assign I (I+1)
+ %endrep
+
+ mov mh_segs, 0 ;start from the first 8 segments
+	mov	pref, 1024		;avoid prefetching repeatedly
+ .segs_loop:
+ xor ROUND, ROUND
+ ;; Initialize digests
+ vmovdqa a, [rsp + 0*64 + mh_segs]
+ vmovdqa b, [rsp + 1*64 + mh_segs]
+ vmovdqa c, [rsp + 2*64 + mh_segs]
+ vmovdqa d, [rsp + 3*64 + mh_segs]
+ vmovdqa e, [rsp + 4*64 + mh_segs]
+ vmovdqa f, [rsp + 5*64 + mh_segs]
+ vmovdqa g, [rsp + 6*64 + mh_segs]
+ vmovdqa h, [rsp + 7*64 + mh_segs]
+
+ %assign i 0
+ %rep 4
+ ROUND_00_15_R TT0, (i*4+0), mh_data_p
+ ROUND_00_15_R TT1, (i*4+1), mh_data_p
+ ROUND_00_15_R TT2, (i*4+2), mh_data_p
+ ROUND_00_15_R TT3, (i*4+3), mh_data_p
+ %assign i (i+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*0]
+
+ %assign i 16
+ %rep 48
+ ROUND_16_XX T1, i, mh_data_p
+ %if i % 16 = 8
+ PREFETCH_X [mh_in_p + pref+128*(i/16)]
+ %endif
+ %assign i (i+1)
+ %endrep
+
+ ;; add old digest
+ vpaddd a, a, [rsp + 0*64 + mh_segs]
+ vpaddd b, b, [rsp + 1*64 + mh_segs]
+ vpaddd c, c, [rsp + 2*64 + mh_segs]
+ vpaddd d, d, [rsp + 3*64 + mh_segs]
+ vpaddd e, e, [rsp + 4*64 + mh_segs]
+ vpaddd f, f, [rsp + 5*64 + mh_segs]
+ vpaddd g, g, [rsp + 6*64 + mh_segs]
+ vpaddd h, h, [rsp + 7*64 + mh_segs]
+
+ ; write out digests
+ vmovdqa [rsp + 0*64 + mh_segs], a
+ vmovdqa [rsp + 1*64 + mh_segs], b
+ vmovdqa [rsp + 2*64 + mh_segs], c
+ vmovdqa [rsp + 3*64 + mh_segs], d
+ vmovdqa [rsp + 4*64 + mh_segs], e
+ vmovdqa [rsp + 5*64 + mh_segs], f
+ vmovdqa [rsp + 6*64 + mh_segs], g
+ vmovdqa [rsp + 7*64 + mh_segs], h
+
+ add pref, 512
+ add mh_data_p, 512
+ add mh_segs, 32
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 4
+ vmovdqa a, [rsp + I*64*2 + 32*0]
+ vmovdqa b, [rsp + I*64*2 + 32*1]
+ vmovdqa c, [rsp + I*64*2 + 32*2]
+ vmovdqa d, [rsp + I*64*2 + 32*3]
+
+ VMOVPS [mh_digests_p + I*64*2 + 32*0], a
+ VMOVPS [mh_digests_p + I*64*2 + 32*1], b
+ VMOVPS [mh_digests_p + I*64*2 + 32*2], c
+ VMOVPS [mh_digests_p + I*64*2 + 32*3], d
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=64
+
+align 64
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
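
The ROUND_16_XX macro above computes, per 32-bit lane, the standard SHA-256 message-schedule
recurrence W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16], with
sigma0(x) = ROR7(x) ^ ROR18(x) ^ (x >> 3) and sigma1(x) = ROR17(x) ^ ROR19(x) ^ (x >> 10).
A minimal scalar C sketch of that recurrence is given below for reference; the helper names
(rotr32, sigma0, sigma1, expand_schedule) are illustrative only and are not part of this library.

    #include <stdint.h>

    /* Rotate a 32-bit word right by n bits (0 < n < 32). */
    static inline uint32_t rotr32(uint32_t x, unsigned n)
    {
            return (x >> n) | (x << (32 - n));
    }

    /* sigma0/sigma1 as computed per lane by ROUND_16_XX (PRORD by 7 and 18 plus a
     * 3-bit shift, and PRORD by 17 and 19 plus a 10-bit shift). */
    static inline uint32_t sigma0(uint32_t x) { return rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3); }
    static inline uint32_t sigma1(uint32_t x) { return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10); }

    /* Expand one 16-word block into the 64-word SHA-256 message schedule. */
    static void expand_schedule(uint32_t w[64])
    {
            for (int t = 16; t < 64; t++)
                    w[t] = sigma1(w[t - 2]) + w[t - 7] + sigma0(w[t - 15]) + w[t - 16];
    }
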
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx512.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx512.asm
new file mode 100644
index 000000000..1ee76ddfc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx512.asm
@@ -0,0 +1,682 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA256 segment digests using AVX-512
+;;
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha256
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+;variables used for storing segs_digests on the stack
+%define RSP_SAVE tmp2
+%define FRAMESZ 4*8*16 ;BYTES*DWORDS*SEGS
+; Common definitions
+%define ROUND tmp4
+%define TBL tmp5
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovups
+
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define F zmm5
+%define G zmm6
+%define H zmm7
+%define T1 zmm8
+%define TMP0 zmm9
+%define TMP1 zmm10
+%define TMP2 zmm11
+%define TMP3 zmm12
+%define TMP4 zmm13
+%define TMP5 zmm14
+%define TMP6 zmm15
+
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro ROTATE_ARGS 0
+%xdefine TMP_ H
+%xdefine H G
+%xdefine G F
+%xdefine F E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define APPEND(a,b) a %+ b
+;; CH(E, F, G) = (E&F) ^ (~E&G)
+;; MAJ(A, B, C) = (A&B) ^ (A&C) ^ (B&C)
+;; SIGMA0 = ROR_2 ^ ROR_13 ^ ROR_22
+;; SIGMA1 = ROR_6 ^ ROR_11 ^ ROR_25
+;; sigma0 = ROR_7 ^ ROR_18 ^ SHR_3
+;; sigma1 = ROR_17 ^ ROR_19 ^ SHR_10
+
+; Main processing loop per round
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%ROUND %2
+ ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt
+ ;; T2 = SIGMA0(A) + MAJ(A, B, C)
+ ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
+
+ ;; H becomes T2, then add T1 for A
+ ;; D becomes D + T1 for E
+
+ vpaddd T1, H, TMP3 ; T1 = H + Kt
+ vmovdqa32 TMP0, E
+ vprord TMP1, E, 6 ; ROR_6(E)
+ vprord TMP2, E, 11 ; ROR_11(E)
+ vprord TMP3, E, 25 ; ROR_25(E)
+ vpternlogd TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G)
+ vpaddd T1, T1, %%WT ; T1 = T1 + Wt
+ vpternlogd TMP1, TMP2, TMP3, 0x96 ; TMP1 = SIGMA1(E)
+ vpaddd T1, T1, TMP0 ; T1 = T1 + CH(E,F,G)
+ vpaddd T1, T1, TMP1 ; T1 = T1 + SIGMA1(E)
+ vpaddd D, D, T1 ; D = D + T1
+
+ vprord H, A, 2 ; ROR_2(A)
+ vprord TMP2, A, 13 ; ROR_13(A)
+ vprord TMP3, A, 22 ; ROR_22(A)
+ vmovdqa32 TMP0, A
+ vpternlogd TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C)
+ vpternlogd H, TMP2, TMP3, 0x96 ; H(T2) = SIGMA0(A)
+ vpaddd H, H, TMP0 ; H(T2) = SIGMA0(A) + MAJ(A,B,C)
+ vpaddd H, H, T1 ; H(A) = H(T2) + T1
+
+ vmovdqa32 TMP3, [TBL + ((%%ROUND+1)*64)] ; Next Kt
+
+ ;; Rotate the args A-H (rotation of names associated with regs)
+ ROTATE_ARGS
+%endmacro
+
+%macro MSG_SCHED_ROUND_16_63 4
+%define %%WT %1
+%define %%WTp1 %2
+%define %%WTp9 %3
+%define %%WTp14 %4
+ vprord TMP4, %%WTp14, 17 ; ROR_17(Wt-2)
+ vprord TMP5, %%WTp14, 19 ; ROR_19(Wt-2)
+ vpsrld TMP6, %%WTp14, 10 ; SHR_10(Wt-2)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma1(Wt-2)
+
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2)
+ vpaddd %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7
+
+ vprord TMP4, %%WTp1, 7 ; ROR_7(Wt-15)
+ vprord TMP5, %%WTp1, 18 ; ROR_18(Wt-15)
+ vpsrld TMP6, %%WTp1, 3 ; SHR_3(Wt-15)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma0(Wt-15)
+
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) +
+ ; Wt-7 + sigma0(Wt-15) +
+%endmacro
+
+; Note this is reading in a block of data for one lane
+; When all 16 are read, the data must be transposed to build msg schedule
+%macro MSG_SCHED_ROUND_00_15 2
+%define %%WT %1
+%define %%OFFSET %2
+ mov inp0, [IN + (%%OFFSET*8)]
+ vmovups %%WT, [inp0+IDX]
+%endmacro
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h7 | h7 | h7 | ...| h7 | | Ha| Hb | Hc |...| Hp |
+
+[bits 64]
+section .text
+align 32
+
+;void mh_sha256_block_avx512(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including the segment digests (uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], 8 words x 16 segments)
+; arg 2 pointer to the aligned frame_buffer, used to hold the big-endian copy of the data
+; arg 3 number of 1KB blocks
+;
+global mh_sha256_block_avx512
+func(mh_sha256_block_avx512)
+ endbranch
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 64 Bytes needed by avx512
+ and rsp, ~0x3F
+ lea TBL,[TABLE]
+
+ ; copy segs_digests into stack and ZMM
+ VMOVPS A, [mh_digests_p + 64*0]
+ VMOVPS B, [mh_digests_p + 64*1]
+ VMOVPS C, [mh_digests_p + 64*2]
+ VMOVPS D, [mh_digests_p + 64*3]
+ VMOVPS E, [mh_digests_p + 64*4]
+ VMOVPS F, [mh_digests_p + 64*5]
+ VMOVPS G, [mh_digests_p + 64*6]
+ VMOVPS H, [mh_digests_p + 64*7]
+
+.block_loop:
+ ; Save digests for later addition
+ vmovdqa32 [rsp + 64*0], A
+ vmovdqa32 [rsp + 64*1], B
+ vmovdqa32 [rsp + 64*2], C
+ vmovdqa32 [rsp + 64*3], D
+ vmovdqa32 [rsp + 64*4], E
+ vmovdqa32 [rsp + 64*5], F
+ vmovdqa32 [rsp + 64*6], G
+ vmovdqa32 [rsp + 64*7], H
+
+ vmovdqa32 TMP3, [TBL] ; First K
+	;transform the input data to big-endian
+	vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK]
+	;keep the message schedule in the 16 extra ZMM registers (W0-W15) instead of storing it to memory
+%assign I 0
+%rep 8
+%assign J (I+1)
+ VMOVPS APPEND(W,I),[mh_in_p + I*64+0*64]
+ VMOVPS APPEND(W,J),[mh_in_p + I*64+1*64]
+
+ vpshufb APPEND(W,I), APPEND(W,I), TMP2
+ vpshufb APPEND(W,J), APPEND(W,J), TMP2
+%assign I (I+2)
+%endrep
+
+ ; MSG Schedule for W0-W15 is now complete in registers
+ ; Process first 48 rounds
+ ; Calculate next Wt+16 after processing is complete and Wt is unneeded
+
+ ; PROCESS_LOOP_00_47 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M)
+
+%assign I 0
+%assign J 0
+%assign K 1
+%assign L 9
+%assign M 14
+%rep 64
+ PROCESS_LOOP APPEND(W,J), I
+ %if I < 48
+ MSG_SCHED_ROUND_16_63 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+ %endif
+ %if I % 8 = 4
+ PREFETCH_X [mh_in_p + 1024+128*(I / 8)]
+ %endif
+%assign I (I+1)
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%endrep
+
+ ;; add old digest
+ vpaddd A, A, [rsp + 0*64]
+ vpaddd B, B, [rsp + 1*64]
+ vpaddd C, C, [rsp + 2*64]
+ vpaddd D, D, [rsp + 3*64]
+ vpaddd E, E, [rsp + 4*64]
+ vpaddd F, F, [rsp + 5*64]
+ vpaddd G, G, [rsp + 6*64]
+ vpaddd H, H, [rsp + 7*64]
+
+ add mh_in_p, 1024
+ sub loops, 1
+ jne .block_loop
+
+ ; copy segs_digests back to mh_digests_p
+
+ VMOVPS [mh_digests_p + 64*0], A
+ VMOVPS [mh_digests_p + 64*1], B
+ VMOVPS [mh_digests_p + 64*2], C
+ VMOVPS [mh_digests_p + 64*3], D
+ VMOVPS [mh_digests_p + 64*4], E
+ VMOVPS [mh_digests_p + 64*5], F
+ VMOVPS [mh_digests_p + 64*6], G
+ VMOVPS [mh_digests_p + 64*7], H
+
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+align 64
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+
+
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_mh_sha256_block_avx512
+no_mh_sha256_block_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
+
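
For reference, PROCESS_LOOP above evaluates the usual SHA-256 round, T1 = H + SIGMA1(E) + CH(E,F,G) + Kt + Wt
and T2 = SIGMA0(A) + MAJ(A,B,C), on sixteen segments per instruction; the vpternlogd immediates 0xCA, 0xE8
and 0x96 encode the three-input truth tables of CH, MAJ and a three-way XOR respectively. A scalar C sketch
of one such round follows; sha256_round and its helpers are illustrative names, not library API.

    #include <stdint.h>

    static inline uint32_t rotr32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }

    static inline uint32_t big_sigma0(uint32_t a) { return rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22); }
    static inline uint32_t big_sigma1(uint32_t e) { return rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25); }
    static inline uint32_t ch(uint32_t e, uint32_t f, uint32_t g)  { return (e & f) ^ (~e & g); }
    static inline uint32_t maj(uint32_t a, uint32_t b, uint32_t c) { return (a & b) ^ (a & c) ^ (b & c); }

    /* One SHA-256 round on a working state s[8] = {a,b,c,d,e,f,g,h}, matching the
     * T1/T2 comments in PROCESS_LOOP; kt and wt are the round constant and schedule word. */
    static void sha256_round(uint32_t s[8], uint32_t kt, uint32_t wt)
    {
            uint32_t t1 = s[7] + big_sigma1(s[4]) + ch(s[4], s[5], s[6]) + kt + wt;
            uint32_t t2 = big_sigma0(s[0]) + maj(s[0], s[1], s[2]);

            s[7] = s[6]; s[6] = s[5]; s[5] = s[4];
            s[4] = s[3] + t1;
            s[3] = s[2]; s[2] = s[1]; s[1] = s[0];
            s[0] = t1 + t2;
    }
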
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_base.c
new file mode 100644
index 000000000..8d9a828c6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_base.c
@@ -0,0 +1,188 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "mh_sha256_internal.h"
+#include <string.h>
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Base multi-hash SHA256 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// store_w is only used for step 0 ~ 15
+#define store_w(s, i, w, ww) (w[i][s] = to_be32(ww[i*HASH_SEGS+s]))
+#define Ws(x, s) w[(x) & 15][s]
+// update_w is used for step > 15
+#define update_w(s, i, w) \
+ Ws(i, s) = Ws(i-16, s) + S0(Ws(i-15, s)) + Ws(i-7, s) + S1(Ws(i-2, s))
+#define update_t2(s, a, b, c) t2[s] = s0(a[s]) + maj(a[s],b[s],c[s])
+#define update_t1(s, h, e, f, g, i, k) \
+ t1[s] = h[s] + s1(e[s]) + ch(e[s],f[s],g[s]) + k + Ws(i, s);
+#define update_d(s) d[s] += t1[s]
+#define update_h(s) h[s] = t1[s] + t2[s]
+
+// s is an iterator over the HASH_SEGS segments
+#define STORE_W(s, i, w, ww) \
+ for(s = 0; s < HASH_SEGS; s++) \
+ store_w(s, i, w, ww);
+#define UPDATE_W(s, i, w) \
+ for(s = 0; s < HASH_SEGS; s++) \
+ update_w(s, i, w);
+#define UPDATE_T2(s, a, b, c) \
+ for(s = 0; s < HASH_SEGS; s++) \
+ update_t2(s, a, b, c);
+#define UPDATE_T1(s, h, e, f, g, i, k) \
+ for(s = 0; s < HASH_SEGS; s++) \
+ update_t1(s, h, e, f, g, i, k);
+#define UPDATE_D(s) \
+ for(s = 0; s < HASH_SEGS; s++) \
+ update_d(s);
+#define UPDATE_H(s) \
+ for(s = 0; s < HASH_SEGS; s++) \
+ update_h(s);
+
+static inline void step(int i, uint32_t * a, uint32_t * b, uint32_t * c,
+ uint32_t * d, uint32_t * e, uint32_t * f,
+ uint32_t * g, uint32_t * h, uint32_t k,
+ uint32_t * t1, uint32_t * t2, uint32_t(*w)[HASH_SEGS], uint32_t * ww)
+{
+ uint8_t s;
+ if (i < 16) {
+ STORE_W(s, i, w, ww);
+ } else {
+ UPDATE_W(s, i, w);
+ }
+ UPDATE_T2(s, a, b, c);
+ UPDATE_T1(s, h, e, f, g, i, k);
+ UPDATE_D(s);
+ UPDATE_H(s);
+}
+
+static inline void init_abcdefgh(uint32_t * xx, uint32_t n,
+ uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS])
+{
+ uint8_t s;
+ for (s = 0; s < HASH_SEGS; s++)
+ xx[s] = digests[n][s];
+}
+
+static inline void add_abcdefgh(uint32_t * xx, uint32_t n,
+ uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS])
+{
+ uint8_t s;
+ for (s = 0; s < HASH_SEGS; s++)
+ digests[n][s] += xx[s];
+}
+
+/*
+ * Perform the 64 steps of the multi-hash SHA256 algorithm for a single
+ * block of data. The caller is responsible for supplying a full block.
+ *
+ * Arguments:
+ *	input        - pointer to the block of input data
+ *	digests      - the space that holds the digests of all the segments
+ *	frame_buffer - scratch buffer used to hold the message schedule (w)
+ *
+ * Return:
+ * N/A
+ */
+void mh_sha256_single(const uint8_t * input, uint32_t(*digests)[HASH_SEGS],
+ uint8_t * frame_buffer)
+{
+ uint8_t i;
+ uint32_t aa[HASH_SEGS], bb[HASH_SEGS], cc[HASH_SEGS], dd[HASH_SEGS];
+ uint32_t ee[HASH_SEGS], ff[HASH_SEGS], gg[HASH_SEGS], hh[HASH_SEGS];
+ uint32_t t1[HASH_SEGS], t2[HASH_SEGS];
+ uint32_t *ww = (uint32_t *) input;
+ uint32_t(*w)[HASH_SEGS];
+
+ const static uint32_t k[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+ };
+
+ w = (uint32_t(*)[HASH_SEGS]) frame_buffer;
+
+ init_abcdefgh(aa, 0, digests);
+ init_abcdefgh(bb, 1, digests);
+ init_abcdefgh(cc, 2, digests);
+ init_abcdefgh(dd, 3, digests);
+ init_abcdefgh(ee, 4, digests);
+ init_abcdefgh(ff, 5, digests);
+ init_abcdefgh(gg, 6, digests);
+ init_abcdefgh(hh, 7, digests);
+
+ for (i = 0; i < 64; i += 8) {
+ step(i, aa, bb, cc, dd, ee, ff, gg, hh, k[i], t1, t2, w, ww);
+ step(i + 1, hh, aa, bb, cc, dd, ee, ff, gg, k[i + 1], t1, t2, w, ww);
+ step(i + 2, gg, hh, aa, bb, cc, dd, ee, ff, k[i + 2], t1, t2, w, ww);
+ step(i + 3, ff, gg, hh, aa, bb, cc, dd, ee, k[i + 3], t1, t2, w, ww);
+ step(i + 4, ee, ff, gg, hh, aa, bb, cc, dd, k[i + 4], t1, t2, w, ww);
+ step(i + 5, dd, ee, ff, gg, hh, aa, bb, cc, k[i + 5], t1, t2, w, ww);
+ step(i + 6, cc, dd, ee, ff, gg, hh, aa, bb, k[i + 6], t1, t2, w, ww);
+ step(i + 7, bb, cc, dd, ee, ff, gg, hh, aa, k[i + 7], t1, t2, w, ww);
+ }
+
+ add_abcdefgh(aa, 0, digests);
+ add_abcdefgh(bb, 1, digests);
+ add_abcdefgh(cc, 2, digests);
+ add_abcdefgh(dd, 3, digests);
+ add_abcdefgh(ee, 4, digests);
+ add_abcdefgh(ff, 5, digests);
+ add_abcdefgh(gg, 6, digests);
+ add_abcdefgh(hh, 7, digests);
+}
+
+void mh_sha256_block_base(const uint8_t * input_data,
+ uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks)
+{
+ uint32_t i;
+
+ for (i = 0; i < num_blocks; i++) {
+ mh_sha256_single(input_data, digests, frame_buffer);
+ input_data += MH_SHA256_BLOCK_SIZE;
+ }
+
+ return;
+}
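
To make the interleaved layout concrete, below is a hedged usage sketch of mh_sha256_block_base: each
of the 16 segments starts from the standard SHA-256 initial hash values, stored word-major
(digests[word][segment]), and one call consumes one 1KB block. The constant values are assumptions
taken from the comments in this file (the real definitions live in mh_sha256_internal.h), and this
internal block routine is shown here only to illustrate the data layout.

    #include <stdint.h>

    /* Assumed to match mh_sha256_internal.h (see the layout comments above). */
    #define SHA256_DIGEST_WORDS  8
    #define HASH_SEGS            16
    #define MH_SHA256_BLOCK_SIZE 1024

    void mh_sha256_block_base(const uint8_t *input_data,
                              uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
                              uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);

    int main(void)
    {
            static const uint32_t iv[SHA256_DIGEST_WORDS] = {
                    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
                    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
            };
            uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS];
            uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE];
            uint8_t block[MH_SHA256_BLOCK_SIZE] = { 0 };    /* one 1KB block of input */

            /* Seed every segment with the SHA-256 initial hash values, word-major. */
            for (int w = 0; w < SHA256_DIGEST_WORDS; w++)
                    for (int s = 0; s < HASH_SEGS; s++)
                            digests[w][s] = iv[w];

            mh_sha256_block_base(block, digests, frame_buffer, 1);
            return 0;
    }
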
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_sse.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_sse.asm
new file mode 100644
index 000000000..b1d6fd9ea
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_sse.asm
@@ -0,0 +1,557 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA256 segment digests using SSE
+;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha256
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables used for storing segs_digests on the stack
+%define RSP_SAVE tmp2
+%define FRAMESZ 4*8*16 ;BYTES*DWORDS*SEGS
+
+; Common definitions
+%define ROUND tmp4
+%define TBL tmp5
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define MOVPS movups
+
+%define SZ 4
+%define SZ4 4*SZ
+%define ROUNDS 64*SZ4
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+
+; PRORD reg, imm, tmp
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ psrld %%reg, %%imm
+ pslld %%tmp, (32-(%%imm))
+ por %%reg, %%tmp
+%endmacro
+
+; PRORD dst/src, amt
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15_R 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ movdqa a0, e ; sig1: a0 = e
+ movdqa a1, e ; sig1: s1 = e
+ PRORD a0, (11-6) ; sig1: a0 = (e >> 5)
+
+ movdqa a2, f ; ch: a2 = f
+ pxor a2, g ; ch: a2 = f^g
+ pand a2, e ; ch: a2 = (f^g)&e
+ pxor a2, g ; a2 = ch
+
+ PRORD a1, 25 ; sig1: a1 = (e >> 25)
+ movdqa %%T1,[SZ4*(%%i&0xf) + %%data]
+ paddd %%T1,[TBL + ROUND] ; T1 = W + K
+ pxor a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ paddd h, a2 ; h = h + ch
+ movdqa a2, a ; sig0: a2 = a
+ PRORD a2, (13-2) ; sig0: a2 = (a >> 11)
+ paddd h, %%T1 ; h = h + ch + W + K
+ pxor a0, a1 ; a0 = sigma1
+ movdqa a1, a ; sig0: a1 = a
+ movdqa %%T1, a ; maj: T1 = a
+ PRORD a1, 22 ; sig0: a1 = (a >> 22)
+ pxor %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ pand %%T1, b ; maj: T1 = (a^c)&b
+ paddd h, a0
+
+ paddd d, h
+
+ pxor a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ pxor a2, a1 ; a2 = sig0
+ movdqa a1, a ; maj: a1 = a
+ pand a1, c ; maj: a1 = a&c
+ por a1, %%T1 ; a1 = maj
+ paddd h, a1 ; h = h + ch + W + K + maj
+ paddd h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15_W 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ movdqa a0, e ; sig1: a0 = e
+ movdqa a1, e ; sig1: s1 = e
+ PRORD a0, (11-6) ; sig1: a0 = (e >> 5)
+
+ movdqa a2, f ; ch: a2 = f
+ pxor a2, g ; ch: a2 = f^g
+ pand a2, e ; ch: a2 = (f^g)&e
+ pxor a2, g ; a2 = ch
+
+ PRORD a1, 25 ; sig1: a1 = (e >> 25)
+ movdqa [SZ4*(%%i&0xf) + %%data], %%T1
+ paddd %%T1,[TBL + ROUND] ; T1 = W + K
+ pxor a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ paddd h, a2 ; h = h + ch
+ movdqa a2, a ; sig0: a2 = a
+ PRORD a2, (13-2) ; sig0: a2 = (a >> 11)
+ paddd h, %%T1 ; h = h + ch + W + K
+ pxor a0, a1 ; a0 = sigma1
+ movdqa a1, a ; sig0: a1 = a
+ movdqa %%T1, a ; maj: T1 = a
+ PRORD a1, 22 ; sig0: a1 = (a >> 22)
+ pxor %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ pand %%T1, b ; maj: T1 = (a^c)&b
+ paddd h, a0
+
+ paddd d, h
+
+ pxor a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ pxor a2, a1 ; a2 = sig0
+ movdqa a1, a ; maj: a1 = a
+ pand a1, c ; maj: a1 = a&c
+ por a1, %%T1 ; a1 = maj
+ paddd h, a1 ; h = h + ch + W + K + maj
+ paddd h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ movdqa %%T1, [SZ4*((%%i-15)&0xf) + %%data]
+ movdqa a1, [SZ4*((%%i-2)&0xf) + %%data]
+ movdqa a0, %%T1
+ PRORD %%T1, 18-7
+ movdqa a2, a1
+ PRORD a1, 19-17
+ pxor %%T1, a0
+ PRORD %%T1, 7
+ pxor a1, a2
+ PRORD a1, 17
+ psrld a0, 3
+ pxor %%T1, a0
+ psrld a2, 10
+ pxor a1, a2
+ paddd %%T1, [SZ4*((%%i-16)&0xf) + %%data]
+ paddd a1, [SZ4*((%%i-7)&0xf) + %%data]
+ paddd %%T1, a1
+
+ ROUND_00_15_W %%T1, %%i, %%data
+
+%endm
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h7 | h7 | h7 | ...| h7 | | Ha| Hb | Hc |...| Hp |
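+; (each 64-byte row holds one SHA-256 state word, replicated for the 16 segments)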
+
+align 32
+
+;void mh_sha256_block_sse(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including the segment digests (uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS])
+; arg 2 pointer to aligned_frame_buffer, which is used to save the big-endian data.
+; arg 3 number of 1KB blocks
+;
+mk_global mh_sha256_block_sse, function, internal
+func(mh_sha256_block_sse)
+ endbranch
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 16 Bytes needed by sse
+ and rsp, ~0x0F
+ lea TBL,[TABLE]
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 8
+ MOVPS a, [mh_digests_p + I*64 + 16*0]
+ MOVPS b, [mh_digests_p + I*64 + 16*1]
+ MOVPS c, [mh_digests_p + I*64 + 16*2]
+ MOVPS d, [mh_digests_p + I*64 + 16*3]
+
+ movdqa [rsp + I*64 + 16*0], a
+ movdqa [rsp + I*64 + 16*1], b
+ movdqa [rsp + I*64 + 16*2], c
+ movdqa [rsp + I*64 + 16*3], d
+ %assign I (I+1)
+ %endrep
+
+.block_loop:
+	;transform the input to big-endian and store it in the aligned frame buffer
+ movdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*8 to DWORD*4_SEGS*8*4
+ %assign I 0
+ %rep 16
+ MOVPS TT0,[mh_in_p + I*64+0*16]
+ MOVPS TT1,[mh_in_p + I*64+1*16]
+ MOVPS TT2,[mh_in_p + I*64+2*16]
+ MOVPS TT3,[mh_in_p + I*64+3*16]
+
+ pshufb TT0, TMP
+ movdqa [mh_data_p +(I)*16 +0*256],TT0
+ pshufb TT1, TMP
+ movdqa [mh_data_p +(I)*16 +1*256],TT1
+ pshufb TT2, TMP
+ movdqa [mh_data_p +(I)*16 +2*256],TT2
+ pshufb TT3, TMP
+ movdqa [mh_data_p +(I)*16 +3*256],TT3
+ %assign I (I+1)
+ %endrep
+
+ mov mh_segs, 0 ;start from the first 4 segments
+	mov	pref, 1024		;avoid prefetching repeatedly
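+	; each pass of .segs_loop processes 4 of the 16 segments (mh_segs advances 16 bytes per pass)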
+ .segs_loop:
+ xor ROUND, ROUND
+ ;; Initialize digests
+ movdqa a, [rsp + 0*64 + mh_segs]
+ movdqa b, [rsp + 1*64 + mh_segs]
+ movdqa c, [rsp + 2*64 + mh_segs]
+ movdqa d, [rsp + 3*64 + mh_segs]
+ movdqa e, [rsp + 4*64 + mh_segs]
+ movdqa f, [rsp + 5*64 + mh_segs]
+ movdqa g, [rsp + 6*64 + mh_segs]
+ movdqa h, [rsp + 7*64 + mh_segs]
+
+ %assign i 0
+ %rep 4
+ ROUND_00_15_R TT0, (i*4+0), mh_data_p
+ ROUND_00_15_R TT1, (i*4+1), mh_data_p
+ ROUND_00_15_R TT2, (i*4+2), mh_data_p
+ ROUND_00_15_R TT3, (i*4+3), mh_data_p
+ %assign i (i+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*0]
+
+ %assign i 16
+ %rep 48
+ %if i = 48
+ PREFETCH_X [mh_in_p + pref+128*1]
+ %endif
+ ROUND_16_XX T1, i, mh_data_p
+ %assign i (i+1)
+ %endrep
+
+ ;; add old digest
+ paddd a, [rsp + 0*64 + mh_segs]
+ paddd b, [rsp + 1*64 + mh_segs]
+ paddd c, [rsp + 2*64 + mh_segs]
+ paddd d, [rsp + 3*64 + mh_segs]
+ paddd e, [rsp + 4*64 + mh_segs]
+ paddd f, [rsp + 5*64 + mh_segs]
+ paddd g, [rsp + 6*64 + mh_segs]
+ paddd h, [rsp + 7*64 + mh_segs]
+
+ ; write out digests
+ movdqa [rsp + 0*64 + mh_segs], a
+ movdqa [rsp + 1*64 + mh_segs], b
+ movdqa [rsp + 2*64 + mh_segs], c
+ movdqa [rsp + 3*64 + mh_segs], d
+ movdqa [rsp + 4*64 + mh_segs], e
+ movdqa [rsp + 5*64 + mh_segs], f
+ movdqa [rsp + 6*64 + mh_segs], g
+ movdqa [rsp + 7*64 + mh_segs], h
+
+ add pref, 256
+ add mh_data_p, 256
+ add mh_segs, 16
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 8
+ movdqa a, [rsp + I*64 + 16*0]
+ movdqa b, [rsp + I*64 + 16*1]
+ movdqa c, [rsp + I*64 + 16*2]
+ movdqa d, [rsp + I*64 + 16*3]
+
+ MOVPS [mh_digests_p + I*64 + 16*0], a
+ MOVPS [mh_digests_p + I*64 + 16*1], b
+ MOVPS [mh_digests_p + I*64 + 16*2], c
+ MOVPS [mh_digests_p + I*64 + 16*3], d
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=16
+
+align 16
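+; SHA-256 round constants K[0..63], each 32-bit constant replicated across four lanes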
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
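+; pshufb mask to byte-swap each 32-bit word (little-endian -> big-endian)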
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_finalize_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_finalize_base.c
new file mode 100644
index 000000000..6abb20688
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_finalize_base.c
@@ -0,0 +1,121 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/*
+ * mh_sha256_finalize_base.c contains the default implementations of
+ * mh_sha256_finalize_XXX and mh_sha256_tail_XXX, namely mh_sha256_finalize_base
+ * and mh_sha256_tail_base. Other types are generated by mh_sha256.c through
+ * different predefined macros.
+ * mh_sha256_tail is used to calculate the last incomplete block of input
+ * data. mh_sha256_finalize is the mh_sha256_ctx wrapper of mh_sha256_tail.
+ */
+#ifndef MH_SHA256_FINALIZE_FUNCTION
+#include <string.h>
+#include "mh_sha256_internal.h"
+
+#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_base
+#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_base
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_base
+#define MH_SHA256_FINALIZE_SLVER
+#endif
+
+void MH_SHA256_TAIL_FUNCTION(uint8_t * partial_buffer, uint32_t total_len,
+ uint32_t(*mh_sha256_segs_digests)[HASH_SEGS],
+ uint8_t * frame_buffer, uint32_t digests[SHA256_DIGEST_WORDS])
+{
+ uint64_t partial_buffer_len, len_in_bit;
+
+ partial_buffer_len = total_len % MH_SHA256_BLOCK_SIZE;
+
+ // Padding the first block
+ partial_buffer[partial_buffer_len] = 0x80;
+ partial_buffer_len++;
+ memset(partial_buffer + partial_buffer_len, 0,
+ MH_SHA256_BLOCK_SIZE - partial_buffer_len);
+
+	// Process the first block without the total length if the padding needs 2 blocks
+ if (partial_buffer_len > (MH_SHA256_BLOCK_SIZE - 8)) {
+ MH_SHA256_BLOCK_FUNCTION(partial_buffer, mh_sha256_segs_digests, frame_buffer,
+ 1);
+ //Padding the second block
+ memset(partial_buffer, 0, MH_SHA256_BLOCK_SIZE);
+ }
+	// Append the total length in bits and process the final padded block
+ len_in_bit = to_be64((uint64_t) total_len * 8);
+ *(uint64_t *) (partial_buffer + MH_SHA256_BLOCK_SIZE - 8) = len_in_bit;
+ MH_SHA256_BLOCK_FUNCTION(partial_buffer, mh_sha256_segs_digests, frame_buffer, 1);
+
+ //Calculate multi-hash SHA256 digests (segment digests as input message)
+ sha256_for_mh_sha256((uint8_t *) mh_sha256_segs_digests, digests,
+ 4 * SHA256_DIGEST_WORDS * HASH_SEGS);
+
+ return;
+}
+
+int MH_SHA256_FINALIZE_FUNCTION(struct mh_sha256_ctx *ctx, void *mh_sha256_digest)
+{
+ uint8_t i;
+ uint8_t *partial_block_buffer;
+ uint64_t total_len;
+ uint32_t(*mh_sha256_segs_digests)[HASH_SEGS];
+ uint8_t *aligned_frame_buffer;
+
+ if (ctx == NULL)
+ return MH_SHA256_CTX_ERROR_NULL;
+
+ total_len = ctx->total_length;
+ partial_block_buffer = ctx->partial_block_buffer;
+
+ /* mh_sha256 tail */
+ aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer);
+ mh_sha256_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha256_interim_digests;
+
+ MH_SHA256_TAIL_FUNCTION(partial_block_buffer, total_len, mh_sha256_segs_digests,
+ aligned_frame_buffer, ctx->mh_sha256_digest);
+
+ /* Output the digests of mh_sha256 */
+ if (mh_sha256_digest != NULL) {
+ for (i = 0; i < SHA256_DIGEST_WORDS; i++)
+ ((uint32_t *) mh_sha256_digest)[i] = ctx->mh_sha256_digest[i];
+ }
+
+ return MH_SHA256_CTX_ERROR_NONE;
+}
+
+#ifdef MH_SHA256_FINALIZE_SLVER
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// Version info
+struct slver mh_sha256_finalize_base_slver_000002bb;
+struct slver mh_sha256_finalize_base_slver = { 0x02bb, 0x00, 0x00 };
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_internal.h b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_internal.h
new file mode 100644
index 000000000..8051e3f36
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_internal.h
@@ -0,0 +1,318 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MH_SHA256_INTERNAL_H_
+#define _MH_SHA256_INTERNAL_H_
+
+/**
+ * @file mh_sha256_internal.h
+ * @brief mh_sha256 internal function prototypes and macros
+ *
+ * Interface for mh_sha256 internal functions
+ *
+ */
+#include <stdint.h>
+#include "mh_sha256.h"
+#include "endian_helper.h"
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#ifdef _MSC_VER
+# define inline __inline
+#endif
+
+ // Round a pointer up to a 64-byte boundary
+#define ALIGN_64(pointer) ( ((uint64_t)(pointer) + 0x3F)&(~0x3F) )
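+ // e.g. ALIGN_64(0x1001) == 0x1040 and ALIGN_64(0x1040) == 0x1040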
+
+ /*******************************************************************
+ *mh_sha256 constants and macros
+ ******************************************************************/
+ /* mh_sha256 constants */
+#define MH_SHA256_H0 0x6a09e667UL
+#define MH_SHA256_H1 0xbb67ae85UL
+#define MH_SHA256_H2 0x3c6ef372UL
+#define MH_SHA256_H3 0xa54ff53aUL
+#define MH_SHA256_H4 0x510e527fUL
+#define MH_SHA256_H5 0x9b05688cUL
+#define MH_SHA256_H6 0x1f83d9abUL
+#define MH_SHA256_H7 0x5be0cd19UL
+
+ /* mh_sha256 macros */
+#define ror32(x, r) (((x)>>(r)) ^ ((x)<<(32-(r))))
+
+#define S0(w) (ror32(w,7) ^ ror32(w,18) ^ (w >> 3))
+#define S1(w) (ror32(w,17) ^ ror32(w,19) ^ (w >> 10))
+
+#define s0(a) (ror32(a,2) ^ ror32(a,13) ^ ror32(a,22))
+#define s1(e) (ror32(e,6) ^ ror32(e,11) ^ ror32(e,25))
+#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c))
+#define ch(e,f,g) ((e & f) ^ (g & ~e))
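+ /* Note: relative to FIPS 180-4 notation the capitalization is swapped here:
+  * S0/S1 are the message-schedule sigmas and s0/s1 are the round Sigmas. */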
+
+ /*******************************************************************
+ * SHA256 API internal function prototypes
+ ******************************************************************/
+
+ /**
+ * @brief Performs complete SHA256 algorithm.
+ *
+ * @param input_data  Pointer to buffer containing the input message.
+ * @param digest Pointer to digest to update.
+ * @param len Length of buffer.
+ * @returns None
+ */
+ void sha256_for_mh_sha256(const uint8_t * input_data, uint32_t * digest, const uint32_t len);
+
+ /**
+ * @brief Calculate the sha256 digest of a single block whose size is SHA256_BLOCK_SIZE
+ *
+ * @param data Pointer to data buffer containing the input message.
+ * @param digest Pointer to sha256 digest.
+ * @returns None
+ */
+ void sha256_single_for_mh_sha256(const uint8_t * data, uint32_t digest[]);
+
+ /*******************************************************************
+ * mh_sha256 API internal function prototypes
+ * Multiple versions of Update and Finalize functions are supplied which use
+ * multiple versions of block and tail process subfunctions.
+ ******************************************************************/
+
+ /**
+ * @brief Tail process for multi-hash sha256.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE.
+ * It will output the final SHA256 digest based on mh_sha256_segs_digests.
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param  mh_sha256_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @returns none
+ *
+ */
+ void mh_sha256_tail(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha256_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha256.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE.
+ * It will output the final SHA256 digest based on mh_sha256_segs_digests.
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param  mh_sha256_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha256_digest mh_sha256 digest
+ * @returns none
+ *
+ */
+ void mh_sha256_tail_base(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha256_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha256.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE.
+ * It will output the final SHA256 digest based on mh_sha256_segs_digests.
+ *
+ * @requires SSE
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param  mh_sha256_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha256_digest mh_sha256 digest
+ * @returns none
+ *
+ */
+ void mh_sha256_tail_sse(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha256_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha256.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE.
+ * It will output the final SHA256 digest based on mh_sha256_segs_digests.
+ *
+ * @requires AVX
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param  mh_sha256_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha256_digest mh_sha256 digest
+ * @returns none
+ *
+ */
+ void mh_sha256_tail_avx(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha256_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha256.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE.
+ * It will output the final SHA256 digest based on mh_sha256_segs_digests.
+ *
+ * @requires AVX2
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param  mh_sha256_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha256_digest mh_sha256 digest
+ * @returns none
+ *
+ */
+ void mh_sha256_tail_avx2(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha256_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha256.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE.
+ * It will output the final SHA256 digest based on mh_sha256_segs_digests.
+ *
+ * @requires AVX512
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param  mh_sha256_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha256_digest mh_sha256 digest
+ * @returns none
+ *
+ */
+ void mh_sha256_tail_avx512(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha256_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]);
+
+ /**
+ * @brief Calculate the mh_sha256 digest of blocks whose total size is MH_SHA256_BLOCK_SIZE*N.
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha256_block(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha256 digest of blocks whose total size is MH_SHA256_BLOCK_SIZE*N.
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha256_block_base(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha256 digest of blocks whose total size is MH_SHA256_BLOCK_SIZE*N.
+ *
+ * @requires SSE
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha256_block_sse(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha256 digest of blocks whose total size is MH_SHA256_BLOCK_SIZE*N.
+ *
+ * @requires AVX
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha256_block_avx(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha256 digest of blocks whose total size is MH_SHA256_BLOCK_SIZE*N.
+ *
+ * @requires AVX2
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha256_block_avx2(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha256 digest of blocks whose total size is MH_SHA256_BLOCK_SIZE*N.
+ *
+ * @requires AVX512
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha256_block_avx512(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_multibinary.asm
new file mode 100644
index 000000000..e14fc7eb1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_multibinary.asm
@@ -0,0 +1,77 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf32
+ [bits 32]
+%else
+ default rel
+ [bits 64]
+
+ extern mh_sha256_update_sse
+ extern mh_sha256_update_avx
+ extern mh_sha256_update_avx2
+ extern mh_sha256_finalize_sse
+ extern mh_sha256_finalize_avx
+ extern mh_sha256_finalize_avx2
+
+ %ifdef HAVE_AS_KNOWS_AVX512
+ extern mh_sha256_update_avx512
+ extern mh_sha256_finalize_avx512
+ %endif
+
+%endif
+
+extern mh_sha256_update_base
+extern mh_sha256_finalize_base
+
+mbin_interface mh_sha256_update
+mbin_interface mh_sha256_finalize
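+; mbin_dispatch_init* wires each interface to the best available implementation
+; (base/sse/avx/avx2, plus avx512 when the assembler supports it), selected at
+; run time from detected CPU features.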
+
+%ifidn __OUTPUT_FORMAT__, elf64
+
+ %ifdef HAVE_AS_KNOWS_AVX512
+ mbin_dispatch_init6 mh_sha256_update, mh_sha256_update_base, mh_sha256_update_sse, mh_sha256_update_avx, mh_sha256_update_avx2, mh_sha256_update_avx512
+ mbin_dispatch_init6 mh_sha256_finalize, mh_sha256_finalize_base, mh_sha256_finalize_sse, mh_sha256_finalize_avx, mh_sha256_finalize_avx2, mh_sha256_finalize_avx512
+ %else
+ mbin_dispatch_init5 mh_sha256_update, mh_sha256_update_base, mh_sha256_update_sse, mh_sha256_update_avx, mh_sha256_update_avx2
+ mbin_dispatch_init5 mh_sha256_finalize, mh_sha256_finalize_base, mh_sha256_finalize_sse, mh_sha256_finalize_avx, mh_sha256_finalize_avx2
+ %endif
+
+%else
+ mbin_dispatch_init2 mh_sha256_update, mh_sha256_update_base
+ mbin_dispatch_init2 mh_sha256_finalize, mh_sha256_finalize_base
+%endif
+
+;;; func core, ver, snum
+slversion mh_sha256_update, 00, 00, 02b2
+slversion mh_sha256_finalize, 00, 00, 02b3
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_perf.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_perf.c
new file mode 100644
index 000000000..8095e4f05
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_perf.c
@@ -0,0 +1,180 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha256.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Loop many times over same
+# define TEST_LEN 16*1024
+# define TEST_LOOPS 20000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define TEST_LEN 16*1024*1024
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#define TEST_MEM TEST_LEN
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA256_FUNC_TYPE
+#define MH_SHA256_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha256_update, MH_SHA256_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha256_finalize, MH_SHA256_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA256_CTX_ERROR_NONE){ \
+	    printf("The mh_sha256 function failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 32 == 0)
+ printf("\n");
+ }
+ if (i % 32 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_base[SHA256_DIGEST_WORDS],
+ uint32_t hash_test[SHA256_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha256_fail = 0;
+
+ for (i = 0; i < SHA256_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_base[i])
+ mh_sha256_fail++;
+ }
+
+ if (mh_sha256_fail) {
+ printf("mh_sha256 fail test\n");
+ printf("base: ");
+ dump((char *)hash_base, 32);
+ printf("ref: ");
+ dump((char *)hash_test, 32);
+ }
+
+ return mh_sha256_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int i, fail = 0;
+ uint32_t hash_test[SHA256_DIGEST_WORDS], hash_base[SHA256_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ struct mh_sha256_ctx *update_ctx_test = NULL, *update_ctx_base = NULL;
+ struct perf start, stop;
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_perf:\n");
+
+ buff = malloc(TEST_LEN);
+ update_ctx_test = malloc(sizeof(*update_ctx_test));
+ update_ctx_base = malloc(sizeof(*update_ctx_base));
+
+ if (buff == NULL || update_ctx_base == NULL || update_ctx_test == NULL) {
+ printf("malloc failed test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ // mh_sha256 base version
+ mh_sha256_init(update_ctx_base);
+ mh_sha256_update_base(update_ctx_base, buff, TEST_LEN);
+ mh_sha256_finalize_base(update_ctx_base, hash_base);
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS / 10; i++) {
+ mh_sha256_init(update_ctx_base);
+ mh_sha256_update_base(update_ctx_base, buff, TEST_LEN);
+ mh_sha256_finalize_base(update_ctx_base, hash_base);
+ }
+ perf_stop(&stop);
+ printf("mh_sha256_update_base" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_MEM * i);
+
+ //Update feature test
+ CHECK_RETURN(mh_sha256_init(update_ctx_test));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx_test, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx_test, hash_test));
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ CHECK_RETURN(mh_sha256_init(update_ctx_test));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx_test, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx_test, hash_test));
+ }
+ perf_stop(&stop);
+ printf(xstr(TEST_UPDATE_FUNCTION) TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_MEM * i);
+
+ // Check results
+ fail = compare_digests(hash_base, hash_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", TEST_LEN);
+ return -1;
+ }
+
+ if (fail)
+ printf("Test failed function test%d\n", fail);
+ else
+ printf("Pass func check\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_ref.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_ref.c
new file mode 100644
index 000000000..2aaefecb0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_ref.c
@@ -0,0 +1,410 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha256_internal.h"
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+	// The macros and sub-functions below already exist in the source file
+	// sha256_for_mh_sha256.c, which is part of the ISA-L library as internal
+	// functions. They are duplicated here because of a linking issue:
+	// mh_sha256_ref() needs these macros and sub-functions without linking
+	// against the ISA-L library, so it keeps its own copies in order to
+	// contain the essential sub-functions in its own object file.
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#define W(x) w[(x) & 15]
+
+#define step(i,a,b,c,d,e,f,g,h,k) \
+ if (i<16) W(i) = to_be32(ww[i]); \
+ else \
+ W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \
+ t2 = s0(a) + maj(a,b,c); \
+ t1 = h + s1(e) + ch(e,f,g) + k + W(i); \
+ d += t1; \
+ h = t1 + t2;
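+// Each step() below performs one SHA-256 round; instead of rotating the working
+// variables a..h, the 64 calls permute the argument order.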
+
+void sha256_single_for_mh_sha256_ref(const uint8_t * data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e, f, g, h, t1, t2;
+ uint32_t w[16];
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+ f = digest[5];
+ g = digest[6];
+ h = digest[7];
+
+ step(0, a, b, c, d, e, f, g, h, 0x428a2f98);
+ step(1, h, a, b, c, d, e, f, g, 0x71374491);
+ step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcf);
+ step(3, f, g, h, a, b, c, d, e, 0xe9b5dba5);
+ step(4, e, f, g, h, a, b, c, d, 0x3956c25b);
+ step(5, d, e, f, g, h, a, b, c, 0x59f111f1);
+ step(6, c, d, e, f, g, h, a, b, 0x923f82a4);
+ step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5);
+ step(8, a, b, c, d, e, f, g, h, 0xd807aa98);
+ step(9, h, a, b, c, d, e, f, g, 0x12835b01);
+ step(10, g, h, a, b, c, d, e, f, 0x243185be);
+ step(11, f, g, h, a, b, c, d, e, 0x550c7dc3);
+ step(12, e, f, g, h, a, b, c, d, 0x72be5d74);
+ step(13, d, e, f, g, h, a, b, c, 0x80deb1fe);
+ step(14, c, d, e, f, g, h, a, b, 0x9bdc06a7);
+ step(15, b, c, d, e, f, g, h, a, 0xc19bf174);
+ step(16, a, b, c, d, e, f, g, h, 0xe49b69c1);
+ step(17, h, a, b, c, d, e, f, g, 0xefbe4786);
+ step(18, g, h, a, b, c, d, e, f, 0x0fc19dc6);
+ step(19, f, g, h, a, b, c, d, e, 0x240ca1cc);
+ step(20, e, f, g, h, a, b, c, d, 0x2de92c6f);
+ step(21, d, e, f, g, h, a, b, c, 0x4a7484aa);
+ step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc);
+ step(23, b, c, d, e, f, g, h, a, 0x76f988da);
+ step(24, a, b, c, d, e, f, g, h, 0x983e5152);
+ step(25, h, a, b, c, d, e, f, g, 0xa831c66d);
+ step(26, g, h, a, b, c, d, e, f, 0xb00327c8);
+ step(27, f, g, h, a, b, c, d, e, 0xbf597fc7);
+ step(28, e, f, g, h, a, b, c, d, 0xc6e00bf3);
+ step(29, d, e, f, g, h, a, b, c, 0xd5a79147);
+ step(30, c, d, e, f, g, h, a, b, 0x06ca6351);
+ step(31, b, c, d, e, f, g, h, a, 0x14292967);
+ step(32, a, b, c, d, e, f, g, h, 0x27b70a85);
+ step(33, h, a, b, c, d, e, f, g, 0x2e1b2138);
+ step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc);
+ step(35, f, g, h, a, b, c, d, e, 0x53380d13);
+ step(36, e, f, g, h, a, b, c, d, 0x650a7354);
+ step(37, d, e, f, g, h, a, b, c, 0x766a0abb);
+ step(38, c, d, e, f, g, h, a, b, 0x81c2c92e);
+ step(39, b, c, d, e, f, g, h, a, 0x92722c85);
+ step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1);
+ step(41, h, a, b, c, d, e, f, g, 0xa81a664b);
+ step(42, g, h, a, b, c, d, e, f, 0xc24b8b70);
+ step(43, f, g, h, a, b, c, d, e, 0xc76c51a3);
+ step(44, e, f, g, h, a, b, c, d, 0xd192e819);
+ step(45, d, e, f, g, h, a, b, c, 0xd6990624);
+ step(46, c, d, e, f, g, h, a, b, 0xf40e3585);
+ step(47, b, c, d, e, f, g, h, a, 0x106aa070);
+ step(48, a, b, c, d, e, f, g, h, 0x19a4c116);
+ step(49, h, a, b, c, d, e, f, g, 0x1e376c08);
+ step(50, g, h, a, b, c, d, e, f, 0x2748774c);
+ step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5);
+ step(52, e, f, g, h, a, b, c, d, 0x391c0cb3);
+ step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a);
+ step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f);
+ step(55, b, c, d, e, f, g, h, a, 0x682e6ff3);
+ step(56, a, b, c, d, e, f, g, h, 0x748f82ee);
+ step(57, h, a, b, c, d, e, f, g, 0x78a5636f);
+ step(58, g, h, a, b, c, d, e, f, 0x84c87814);
+ step(59, f, g, h, a, b, c, d, e, 0x8cc70208);
+ step(60, e, f, g, h, a, b, c, d, 0x90befffa);
+ step(61, d, e, f, g, h, a, b, c, 0xa4506ceb);
+ step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7);
+ step(63, b, c, d, e, f, g, h, a, 0xc67178f2);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+ digest[5] += f;
+ digest[6] += g;
+ digest[7] += h;
+}
+
+void sha256_for_mh_sha256_ref(const uint8_t * input_data, uint32_t * digest,
+ const uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA256_BLOCK_SIZE];
+
+ digest[0] = MH_SHA256_H0;
+ digest[1] = MH_SHA256_H1;
+ digest[2] = MH_SHA256_H2;
+ digest[3] = MH_SHA256_H3;
+ digest[4] = MH_SHA256_H4;
+ digest[5] = MH_SHA256_H5;
+ digest[6] = MH_SHA256_H6;
+ digest[7] = MH_SHA256_H7;
+
+ i = len;
+ while (i >= SHA256_BLOCK_SIZE) {
+ sha256_single_for_mh_sha256_ref(input_data, digest);
+ input_data += SHA256_BLOCK_SIZE;
+ i -= SHA256_BLOCK_SIZE;
+ }
+
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA256_BLOCK_SIZE) - 8); j++)
+ buf[j] = 0;
+
+ if (i > SHA256_BLOCK_SIZE - 8)
+ i = 2 * SHA256_BLOCK_SIZE;
+ else
+ i = SHA256_BLOCK_SIZE;
+
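+	// append the message length in bits (big-endian), per the SHA-256 padding rule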
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8);
+
+ sha256_single_for_mh_sha256_ref(buf, digest);
+ if (i == (2 * SHA256_BLOCK_SIZE))
+ sha256_single_for_mh_sha256_ref(buf + SHA256_BLOCK_SIZE, digest);
+}
+
+/*
+ * Rearrange one segment's data from one block into the new_data buffer.
+ *
+ * Layout of new_data:
+ * segment
+ * -------------------------
+ * w0 | w1 | ... | w15
+ *
+ */
+static inline void transform_input_single(uint32_t * new_data, uint32_t * input,
+ uint32_t segment)
+{
+ new_data[16 * segment + 0] = input[16 * 0 + segment];
+ new_data[16 * segment + 1] = input[16 * 1 + segment];
+ new_data[16 * segment + 2] = input[16 * 2 + segment];
+ new_data[16 * segment + 3] = input[16 * 3 + segment];
+ new_data[16 * segment + 4] = input[16 * 4 + segment];
+ new_data[16 * segment + 5] = input[16 * 5 + segment];
+ new_data[16 * segment + 6] = input[16 * 6 + segment];
+ new_data[16 * segment + 7] = input[16 * 7 + segment];
+ new_data[16 * segment + 8] = input[16 * 8 + segment];
+ new_data[16 * segment + 9] = input[16 * 9 + segment];
+ new_data[16 * segment + 10] = input[16 * 10 + segment];
+ new_data[16 * segment + 11] = input[16 * 11 + segment];
+ new_data[16 * segment + 12] = input[16 * 12 + segment];
+ new_data[16 * segment + 13] = input[16 * 13 + segment];
+ new_data[16 * segment + 14] = input[16 * 14 + segment];
+ new_data[16 * segment + 15] = input[16 * 15 + segment];
+}
+
+// Adapt parameters to sha256_single_for_mh_sha256_ref
+#define sha256_update_one_seg(data, digest) \
+ sha256_single_for_mh_sha256_ref((const uint8_t *)(data), (uint32_t *)(digest))
+
+/*
+ * Rearrange all segments' data from one block into the new_data buffer.
+ *
+ * Layout of new_data:
+ * segment
+ * -------------------------
+ * seg0: | w0 | w1 | ... | w15
+ * seg1: | w0 | w1 | ... | w15
+ * seg2: | w0 | w1 | ... | w15
+ * ....
+ * seg15: | w0 | w1 | ... | w15
+ *
+ */
+static inline void transform_input(uint32_t * new_data, uint32_t * input, uint32_t block)
+{
+ uint32_t *current_input = input + block * MH_SHA256_BLOCK_SIZE / 4;
+
+ transform_input_single(new_data, current_input, 0);
+ transform_input_single(new_data, current_input, 1);
+ transform_input_single(new_data, current_input, 2);
+ transform_input_single(new_data, current_input, 3);
+ transform_input_single(new_data, current_input, 4);
+ transform_input_single(new_data, current_input, 5);
+ transform_input_single(new_data, current_input, 6);
+ transform_input_single(new_data, current_input, 7);
+ transform_input_single(new_data, current_input, 8);
+ transform_input_single(new_data, current_input, 9);
+ transform_input_single(new_data, current_input, 10);
+ transform_input_single(new_data, current_input, 11);
+ transform_input_single(new_data, current_input, 12);
+ transform_input_single(new_data, current_input, 13);
+ transform_input_single(new_data, current_input, 14);
+ transform_input_single(new_data, current_input, 15);
+
+}
+
+/*
+ * Update all segments' digests with the data of one block.
+ *
+ * Layout of seg_digest:
+ * segment
+ * -------------------------
+ * seg0: | H0 | H1 | ... | H7
+ * seg1: | H0 | H1 | ... | H7
+ * seg2: | H0 | H1 | ... | H7
+ * ....
+ * seg15: | H0 | H1 | ... | H7
+ *
+ */
+static inline void sha256_update_all_segs(uint32_t * new_data, uint32_t(*mh_sha256_seg_digests)
+ [SHA256_DIGEST_WORDS])
+{
+ sha256_update_one_seg(&(new_data)[16 * 0], mh_sha256_seg_digests[0]);
+ sha256_update_one_seg(&(new_data)[16 * 1], mh_sha256_seg_digests[1]);
+ sha256_update_one_seg(&(new_data)[16 * 2], mh_sha256_seg_digests[2]);
+ sha256_update_one_seg(&(new_data)[16 * 3], mh_sha256_seg_digests[3]);
+ sha256_update_one_seg(&(new_data)[16 * 4], mh_sha256_seg_digests[4]);
+ sha256_update_one_seg(&(new_data)[16 * 5], mh_sha256_seg_digests[5]);
+ sha256_update_one_seg(&(new_data)[16 * 6], mh_sha256_seg_digests[6]);
+ sha256_update_one_seg(&(new_data)[16 * 7], mh_sha256_seg_digests[7]);
+ sha256_update_one_seg(&(new_data)[16 * 8], mh_sha256_seg_digests[8]);
+ sha256_update_one_seg(&(new_data)[16 * 9], mh_sha256_seg_digests[9]);
+ sha256_update_one_seg(&(new_data)[16 * 10], mh_sha256_seg_digests[10]);
+ sha256_update_one_seg(&(new_data)[16 * 11], mh_sha256_seg_digests[11]);
+ sha256_update_one_seg(&(new_data)[16 * 12], mh_sha256_seg_digests[12]);
+ sha256_update_one_seg(&(new_data)[16 * 13], mh_sha256_seg_digests[13]);
+ sha256_update_one_seg(&(new_data)[16 * 14], mh_sha256_seg_digests[14]);
+ sha256_update_one_seg(&(new_data)[16 * 15], mh_sha256_seg_digests[15]);
+}
+
+void mh_sha256_block_ref(const uint8_t * input_data, uint32_t(*digests)[HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks)
+{
+ uint32_t i, j;
+ uint32_t *temp_buffer = (uint32_t *) frame_buffer;
+ uint32_t(*trans_digests)[SHA256_DIGEST_WORDS];
+
+ trans_digests = (uint32_t(*)[SHA256_DIGEST_WORDS]) digests;
+
+	// Re-structure seg_digests from 8*16 to 16*8
+ for (j = 0; j < HASH_SEGS; j++) {
+ for (i = 0; i < SHA256_DIGEST_WORDS; i++) {
+ temp_buffer[j * SHA256_DIGEST_WORDS + i] = digests[i][j];
+ }
+ }
+ memcpy(trans_digests, temp_buffer, 4 * SHA256_DIGEST_WORDS * HASH_SEGS);
+
+ // Calculate digests for all segments, leveraging sha256 API
+ for (i = 0; i < num_blocks; i++) {
+ transform_input(temp_buffer, (uint32_t *) input_data, i);
+ sha256_update_all_segs(temp_buffer, trans_digests);
+ }
+
+	// Re-structure seg_digests from 16*8 to 8*16
+ for (j = 0; j < HASH_SEGS; j++) {
+ for (i = 0; i < SHA256_DIGEST_WORDS; i++) {
+ temp_buffer[i * HASH_SEGS + j] = trans_digests[j][i];
+ }
+ }
+ memcpy(digests, temp_buffer, 4 * SHA256_DIGEST_WORDS * HASH_SEGS);
+
+ return;
+}
+
+void mh_sha256_tail_ref(uint8_t * partial_buffer, uint32_t total_len,
+ uint32_t(*mh_sha256_segs_digests)[HASH_SEGS], uint8_t * frame_buffer,
+ uint32_t digests[SHA256_DIGEST_WORDS])
+{
+ uint64_t partial_buffer_len, len_in_bit;
+
+ partial_buffer_len = total_len % MH_SHA256_BLOCK_SIZE;
+
+ // Padding the first block
+ partial_buffer[partial_buffer_len] = 0x80;
+ partial_buffer_len++;
+ memset(partial_buffer + partial_buffer_len, 0,
+ MH_SHA256_BLOCK_SIZE - partial_buffer_len);
+
+	// Process the first block without the total length if the padding needs 2 blocks
+ if (partial_buffer_len > (MH_SHA256_BLOCK_SIZE - 8)) {
+ mh_sha256_block_ref(partial_buffer, mh_sha256_segs_digests, frame_buffer, 1);
+ //Padding the second block
+ memset(partial_buffer, 0, MH_SHA256_BLOCK_SIZE);
+ }
+	// Append the total length in bits and process the final padded block
+ len_in_bit = to_be64((uint64_t) total_len * 8);
+ *(uint64_t *) (partial_buffer + MH_SHA256_BLOCK_SIZE - 8) = len_in_bit;
+ mh_sha256_block_ref(partial_buffer, mh_sha256_segs_digests, frame_buffer, 1);
+
+ //Calculate multi-hash SHA256 digests (segment digests as input message)
+ sha256_for_mh_sha256_ref((uint8_t *) mh_sha256_segs_digests, digests,
+ 4 * SHA256_DIGEST_WORDS * HASH_SEGS);
+
+ return;
+}
+
+void mh_sha256_ref(const void *buffer, uint32_t len, uint32_t * mh_sha256_digest)
+{
+ uint64_t total_len;
+ uint64_t num_blocks;
+ uint32_t mh_sha256_segs_digests[SHA256_DIGEST_WORDS][HASH_SEGS];
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE];
+ uint8_t partial_block_buffer[MH_SHA256_BLOCK_SIZE * 2];
+ uint32_t mh_sha256_hash_dword[SHA256_DIGEST_WORDS];
+ uint32_t i;
+ const uint8_t *input_data = (const uint8_t *)buffer;
+
+ /* Initialize digests of all segments */
+ for (i = 0; i < HASH_SEGS; i++) {
+ mh_sha256_segs_digests[0][i] = MH_SHA256_H0;
+ mh_sha256_segs_digests[1][i] = MH_SHA256_H1;
+ mh_sha256_segs_digests[2][i] = MH_SHA256_H2;
+ mh_sha256_segs_digests[3][i] = MH_SHA256_H3;
+ mh_sha256_segs_digests[4][i] = MH_SHA256_H4;
+ mh_sha256_segs_digests[5][i] = MH_SHA256_H5;
+ mh_sha256_segs_digests[6][i] = MH_SHA256_H6;
+ mh_sha256_segs_digests[7][i] = MH_SHA256_H7;
+ }
+
+ total_len = len;
+
+ // Calculate blocks
+ num_blocks = len / MH_SHA256_BLOCK_SIZE;
+ if (num_blocks > 0) {
+		// process num_blocks full blocks
+ mh_sha256_block_ref(input_data, mh_sha256_segs_digests, frame_buffer,
+ num_blocks);
+ len -= num_blocks * MH_SHA256_BLOCK_SIZE;
+ input_data += num_blocks * MH_SHA256_BLOCK_SIZE;
+ }
+ // Store the partial block
+ if (len != 0) {
+ memcpy(partial_block_buffer, input_data, len);
+ }
+
+ /* Finalize */
+ mh_sha256_tail_ref(partial_block_buffer, total_len, mh_sha256_segs_digests,
+ frame_buffer, mh_sha256_hash_dword);
+
+ // Output the digests of mh_sha256
+ if (mh_sha256_digest != NULL) {
+ mh_sha256_digest[0] = mh_sha256_hash_dword[0];
+ mh_sha256_digest[1] = mh_sha256_hash_dword[1];
+ mh_sha256_digest[2] = mh_sha256_hash_dword[2];
+ mh_sha256_digest[3] = mh_sha256_hash_dword[3];
+ mh_sha256_digest[4] = mh_sha256_hash_dword[4];
+ mh_sha256_digest[5] = mh_sha256_hash_dword[5];
+ mh_sha256_digest[6] = mh_sha256_hash_dword[6];
+ mh_sha256_digest[7] = mh_sha256_hash_dword[7];
+ }
+
+ return;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_test.c
new file mode 100644
index 000000000..13ab91c16
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_test.c
@@ -0,0 +1,217 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha256.h"
+
+#define TEST_LEN 16*1024
+#define TEST_SIZE 8*1024
+#define TEST_MEM TEST_LEN
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA256_FUNC_TYPE
+#define MH_SHA256_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha256_update, MH_SHA256_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha256_finalize, MH_SHA256_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA256_CTX_ERROR_NONE){ \
+	    printf("The mh_sha256 function failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+extern void mh_sha256_ref(const void *buffer, uint32_t len, uint32_t * mh_sha256_digest);
+#define MH_SHA256_REF mh_sha256_ref
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 32 == 0)
+ printf("\n");
+ }
+ if (i % 32 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_ref[SHA256_DIGEST_WORDS],
+ uint32_t hash_test[SHA256_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha256_fail = 0;
+
+ for (i = 0; i < SHA256_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_ref[i])
+ mh_sha256_fail++;
+ }
+
+ if (mh_sha256_fail) {
+ printf("mh_sha256 fail test\n");
+ printf("ref: ");
+ dump((char *)hash_ref, 32);
+ printf("test: ");
+ dump((char *)hash_test, 32);
+ }
+
+ return mh_sha256_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int fail = 0;
+ uint32_t hash_test[SHA256_DIGEST_WORDS], hash_ref[SHA256_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ int size, offset;
+ struct mh_sha256_ctx *update_ctx = NULL;
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_test:\n");
+
+ srand(TEST_SEED);
+
+ buff = malloc(TEST_LEN);
+ update_ctx = malloc(sizeof(*update_ctx));
+
+ if (buff == NULL || update_ctx == NULL) {
+ printf("malloc failed test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ MH_SHA256_REF(buff, TEST_LEN, hash_ref);
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("fail rand1 test\n");
+ return -1;
+ } else
+ putchar('.');
+
+ // Test various size messages
+ for (size = TEST_LEN; size >= 0; size--) {
+
+ // Fill with rand data
+ rand_buffer(buff, size);
+
+ MH_SHA256_REF(buff, size, hash_ref);
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", size);
+ return -1;
+ }
+
+ if ((size & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Test various buffer offsets and sizes
+ printf("offset tests");
+ for (size = TEST_LEN - 256; size > 256; size -= 11) {
+ for (offset = 0; offset < 256; offset++) {
+ MH_SHA256_REF(buff + offset, size, hash_ref);
+
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", size);
+ return -1;
+ }
+
+ }
+ if ((size & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Run efence tests
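+	// (each message ends exactly at the end of buff, to catch reads past the end)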
+ printf("efence tests");
+ for (size = TEST_SIZE; size > 0; size--) {
+ offset = TEST_LEN - size;
+
+ MH_SHA256_REF(buff + offset, size, hash_ref);
+
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", size);
+ return -1;
+ }
+
+ if ((size & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_test:");
+ printf(" %s\n", fail == 0 ? "Pass" : "Fail");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_base.c
new file mode 100644
index 000000000..024ae2b91
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_base.c
@@ -0,0 +1,110 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/*
+ * mh_sha256_update_base.c contains the template for mh_sha256_update_XXX.
+ * The default definitions use the base type, which generates mh_sha256_update_base.
+ * Other variants are generated by mh_sha256.c through different predefined macros.
+ */
+#ifndef MH_SHA256_UPDATE_FUNCTION
+#include "mh_sha256_internal.h"
+#include <string.h>
+
+#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_base
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_base
+#define MH_SHA256_UPDATE_SLVER
+#endif
+
+int MH_SHA256_UPDATE_FUNCTION(struct mh_sha256_ctx *ctx, const void *buffer, uint32_t len)
+{
+
+ uint8_t *partial_block_buffer;
+ uint64_t partial_block_len;
+ uint64_t num_blocks;
+ uint32_t(*mh_sha256_segs_digests)[HASH_SEGS];
+ uint8_t *aligned_frame_buffer;
+ const uint8_t *input_data = (const uint8_t *)buffer;
+
+ if (ctx == NULL)
+ return MH_SHA256_CTX_ERROR_NULL;
+
+ if (len == 0)
+ return MH_SHA256_CTX_ERROR_NONE;
+
+ partial_block_len = ctx->total_length % MH_SHA256_BLOCK_SIZE;
+ partial_block_buffer = ctx->partial_block_buffer;
+ aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer);
+ mh_sha256_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha256_interim_digests;
+
+ ctx->total_length += len;
+ // Not enough input data for an mh_sha256 block calculation
+ if (len + partial_block_len < MH_SHA256_BLOCK_SIZE) {
+ memcpy(partial_block_buffer + partial_block_len, input_data, len);
+ return MH_SHA256_CTX_ERROR_NONE;
+ }
+ // mh_sha256 calculation for the previous partial block
+ if (partial_block_len != 0) {
+ memcpy(partial_block_buffer + partial_block_len, input_data,
+ MH_SHA256_BLOCK_SIZE - partial_block_len);
+ // process the completed block
+ MH_SHA256_BLOCK_FUNCTION(partial_block_buffer, mh_sha256_segs_digests,
+ aligned_frame_buffer, 1);
+ input_data += MH_SHA256_BLOCK_SIZE - partial_block_len;
+ len -= MH_SHA256_BLOCK_SIZE - partial_block_len;
+ memset(partial_block_buffer, 0, MH_SHA256_BLOCK_SIZE);
+ }
+ // Calculate mh_sha256 for the current blocks
+ num_blocks = len / MH_SHA256_BLOCK_SIZE;
+ if (num_blocks > 0) {
+ // process num_blocks whole blocks
+ MH_SHA256_BLOCK_FUNCTION(input_data, mh_sha256_segs_digests,
+ aligned_frame_buffer, num_blocks);
+ len -= num_blocks * MH_SHA256_BLOCK_SIZE;
+ input_data += num_blocks * MH_SHA256_BLOCK_SIZE;
+ }
+ // Store the partial block
+ if (len != 0) {
+ memcpy(partial_block_buffer, input_data, len);
+ }
+
+ return MH_SHA256_CTX_ERROR_NONE;
+
+}
+
+#ifdef MH_SHA256_UPDATE_SLVER
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// Version info
+struct slver mh_sha256_update_base_slver_000002ba;
+struct slver mh_sha256_update_base_slver = { 0x02ba, 0x00, 0x00 };
+#endif
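
For readers following the buffering logic above, here is a minimal, self-contained sketch of the same partial-block scheme. The names (struct stream, process_blocks, BLOCK) are illustrative stand-ins and not the ISA-L API; the real routine instead dispatches MH_SHA256_BLOCK_FUNCTION over HASH_SEGS interleaved segments.

#include <stdint.h>
#include <string.h>

#define BLOCK 64                     /* block size in bytes */

struct stream {
	uint8_t pending[BLOCK];      /* input bytes not yet hashed */
	uint64_t total;              /* total bytes seen so far */
};

/* stand-in for MH_SHA256_BLOCK_FUNCTION */
static void process_blocks(const uint8_t *p, uint64_t nblocks)
{
	(void)p; (void)nblocks;
}

void stream_update(struct stream *s, const uint8_t *in, uint32_t len)
{
	uint64_t have = s->total % BLOCK;        /* bytes already buffered */

	s->total += len;
	if (have + len < BLOCK) {                /* still short of one block */
		memcpy(s->pending + have, in, len);
		return;
	}
	if (have) {                              /* finish the buffered block first */
		memcpy(s->pending + have, in, BLOCK - have);
		process_blocks(s->pending, 1);
		in += BLOCK - have;
		len -= BLOCK - have;
	}
	process_blocks(in, len / BLOCK);         /* whole blocks straight from input */
	memcpy(s->pending, in + (len / BLOCK) * BLOCK, len % BLOCK);
}

After each call the pending buffer holds exactly total % BLOCK bytes, which is the invariant the update/finalize pair above relies on.
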
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_test.c
new file mode 100644
index 000000000..f5b28bba7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_test.c
@@ -0,0 +1,240 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha256.h"
+
+#define TEST_LEN 16*1024
+#define TEST_SIZE 8*1024
+#define TEST_MEM TEST_LEN
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA256_FUNC_TYPE
+#define MH_SHA256_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha256_update, MH_SHA256_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha256_finalize, MH_SHA256_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA256_CTX_ERROR_NONE){ \
+ printf("The mh_sha256 function is failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+extern void mh_sha256_ref(const void *buffer, uint32_t len, uint32_t * mh_sha256_digest);
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 20 == 0)
+ printf("\n");
+ }
+ if (i % 20 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_ref[SHA256_DIGEST_WORDS],
+ uint32_t hash_test[SHA256_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha256_fail = 0;
+
+ for (i = 0; i < SHA256_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_ref[i])
+ mh_sha256_fail++;
+ }
+
+ if (mh_sha256_fail) {
+ printf("mh_sha256 fail test\n");
+ printf("ref: ");
+ dump((char *)hash_ref, 20);
+ printf("test: ");
+ dump((char *)hash_test, 20);
+ }
+
+ return mh_sha256_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int fail = 0, i;
+ uint32_t hash_test[SHA256_DIGEST_WORDS], hash_ref[SHA256_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ int update_count;
+ int size1, size2, offset, addr_offset;
+ struct mh_sha256_ctx *update_ctx = NULL;
+ uint8_t *mem_addr = NULL;
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_test:");
+
+ srand(TEST_SEED);
+
+ buff = malloc(TEST_LEN);
+ update_ctx = malloc(sizeof(*update_ctx));
+
+ if (buff == NULL || update_ctx == NULL) {
+ printf("malloc failed test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha256_ref(buff, TEST_LEN, hash_ref);
+
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("fail rand1 test\n");
+ return -1;
+ } else
+ putchar('.');
+
+ // Test various size messages by update twice.
+ printf("\n various size messages by update twice tests");
+ for (size1 = TEST_LEN; size1 >= 0; size1--) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha256_ref(buff, TEST_LEN, hash_ref);
+
+ // subsequent update
+ size2 = TEST_LEN - size1; // size2 differs from size1
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size1));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + size1, size2));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size1=%d\n", size1);
+ return -1;
+ }
+
+ if ((size2 & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Test various update count
+ printf("\n various update count tests");
+ for (update_count = 1; update_count <= TEST_LEN; update_count++) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha256_ref(buff, TEST_LEN, hash_ref);
+
+ // subsequent update
+ size1 = TEST_LEN / update_count;
+ size2 = TEST_LEN - size1 * (update_count - 1); // size2 differs from size1
+
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ for (i = 1, offset = 0; i < update_count; i++) {
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size1));
+ offset += size1;
+ }
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size2));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size1=%d\n", size1);
+ return -1;
+ }
+
+ if ((size2 & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // test various start address of ctx.
+ printf("\n various start address of ctx test");
+ free(update_ctx);
+ mem_addr = (uint8_t *) malloc(sizeof(*update_ctx) + AVX512_ALIGNED * 10);
+ for (addr_offset = AVX512_ALIGNED * 10; addr_offset >= 0; addr_offset--) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha256_ref(buff, TEST_LEN, hash_ref);
+
+ // an unaligned offset
+ update_ctx = (struct mh_sha256_ctx *)(mem_addr + addr_offset);
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail addr_offset=%d\n", addr_offset);
+ return -1;
+ }
+
+ if ((addr_offset & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ printf("\n" xstr(TEST_UPDATE_FUNCTION) "_test: %s\n", fail == 0 ? "Pass" : "Fail");
+
+ return fail;
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/sha256_for_mh_sha256.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/sha256_for_mh_sha256.c
new file mode 100644
index 000000000..ea8c9f436
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/sha256_for_mh_sha256.c
@@ -0,0 +1,176 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "mh_sha256_internal.h"
+#include <string.h>
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference SHA256 Functions for mh_sha256
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#define W(x) w[(x) & 15]
+
+#define step(i,a,b,c,d,e,f,g,h,k) \
+ if (i<16) W(i) = to_be32(ww[i]); \
+ else \
+ W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \
+ t2 = s0(a) + maj(a,b,c); \
+ t1 = h + s1(e) + ch(e,f,g) + k + W(i); \
+ d += t1; \
+ h = t1 + t2;
+
+void sha256_single_for_mh_sha256(const uint8_t * data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e, f, g, h, t1, t2;
+ uint32_t w[16];
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+ f = digest[5];
+ g = digest[6];
+ h = digest[7];
+
+ step(0, a, b, c, d, e, f, g, h, 0x428a2f98);
+ step(1, h, a, b, c, d, e, f, g, 0x71374491);
+ step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcf);
+ step(3, f, g, h, a, b, c, d, e, 0xe9b5dba5);
+ step(4, e, f, g, h, a, b, c, d, 0x3956c25b);
+ step(5, d, e, f, g, h, a, b, c, 0x59f111f1);
+ step(6, c, d, e, f, g, h, a, b, 0x923f82a4);
+ step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5);
+ step(8, a, b, c, d, e, f, g, h, 0xd807aa98);
+ step(9, h, a, b, c, d, e, f, g, 0x12835b01);
+ step(10, g, h, a, b, c, d, e, f, 0x243185be);
+ step(11, f, g, h, a, b, c, d, e, 0x550c7dc3);
+ step(12, e, f, g, h, a, b, c, d, 0x72be5d74);
+ step(13, d, e, f, g, h, a, b, c, 0x80deb1fe);
+ step(14, c, d, e, f, g, h, a, b, 0x9bdc06a7);
+ step(15, b, c, d, e, f, g, h, a, 0xc19bf174);
+ step(16, a, b, c, d, e, f, g, h, 0xe49b69c1);
+ step(17, h, a, b, c, d, e, f, g, 0xefbe4786);
+ step(18, g, h, a, b, c, d, e, f, 0x0fc19dc6);
+ step(19, f, g, h, a, b, c, d, e, 0x240ca1cc);
+ step(20, e, f, g, h, a, b, c, d, 0x2de92c6f);
+ step(21, d, e, f, g, h, a, b, c, 0x4a7484aa);
+ step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc);
+ step(23, b, c, d, e, f, g, h, a, 0x76f988da);
+ step(24, a, b, c, d, e, f, g, h, 0x983e5152);
+ step(25, h, a, b, c, d, e, f, g, 0xa831c66d);
+ step(26, g, h, a, b, c, d, e, f, 0xb00327c8);
+ step(27, f, g, h, a, b, c, d, e, 0xbf597fc7);
+ step(28, e, f, g, h, a, b, c, d, 0xc6e00bf3);
+ step(29, d, e, f, g, h, a, b, c, 0xd5a79147);
+ step(30, c, d, e, f, g, h, a, b, 0x06ca6351);
+ step(31, b, c, d, e, f, g, h, a, 0x14292967);
+ step(32, a, b, c, d, e, f, g, h, 0x27b70a85);
+ step(33, h, a, b, c, d, e, f, g, 0x2e1b2138);
+ step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc);
+ step(35, f, g, h, a, b, c, d, e, 0x53380d13);
+ step(36, e, f, g, h, a, b, c, d, 0x650a7354);
+ step(37, d, e, f, g, h, a, b, c, 0x766a0abb);
+ step(38, c, d, e, f, g, h, a, b, 0x81c2c92e);
+ step(39, b, c, d, e, f, g, h, a, 0x92722c85);
+ step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1);
+ step(41, h, a, b, c, d, e, f, g, 0xa81a664b);
+ step(42, g, h, a, b, c, d, e, f, 0xc24b8b70);
+ step(43, f, g, h, a, b, c, d, e, 0xc76c51a3);
+ step(44, e, f, g, h, a, b, c, d, 0xd192e819);
+ step(45, d, e, f, g, h, a, b, c, 0xd6990624);
+ step(46, c, d, e, f, g, h, a, b, 0xf40e3585);
+ step(47, b, c, d, e, f, g, h, a, 0x106aa070);
+ step(48, a, b, c, d, e, f, g, h, 0x19a4c116);
+ step(49, h, a, b, c, d, e, f, g, 0x1e376c08);
+ step(50, g, h, a, b, c, d, e, f, 0x2748774c);
+ step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5);
+ step(52, e, f, g, h, a, b, c, d, 0x391c0cb3);
+ step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a);
+ step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f);
+ step(55, b, c, d, e, f, g, h, a, 0x682e6ff3);
+ step(56, a, b, c, d, e, f, g, h, 0x748f82ee);
+ step(57, h, a, b, c, d, e, f, g, 0x78a5636f);
+ step(58, g, h, a, b, c, d, e, f, 0x84c87814);
+ step(59, f, g, h, a, b, c, d, e, 0x8cc70208);
+ step(60, e, f, g, h, a, b, c, d, 0x90befffa);
+ step(61, d, e, f, g, h, a, b, c, 0xa4506ceb);
+ step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7);
+ step(63, b, c, d, e, f, g, h, a, 0xc67178f2);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+ digest[5] += f;
+ digest[6] += g;
+ digest[7] += h;
+}
+
+void sha256_for_mh_sha256(const uint8_t * input_data, uint32_t * digest, const uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA256_BLOCK_SIZE];
+
+ digest[0] = MH_SHA256_H0;
+ digest[1] = MH_SHA256_H1;
+ digest[2] = MH_SHA256_H2;
+ digest[3] = MH_SHA256_H3;
+ digest[4] = MH_SHA256_H4;
+ digest[5] = MH_SHA256_H5;
+ digest[6] = MH_SHA256_H6;
+ digest[7] = MH_SHA256_H7;
+
+ i = len;
+ while (i >= SHA256_BLOCK_SIZE) {
+ sha256_single_for_mh_sha256(input_data, digest);
+ input_data += SHA256_BLOCK_SIZE;
+ i -= SHA256_BLOCK_SIZE;
+ }
+
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA256_BLOCK_SIZE) - 8); j++)
+ buf[j] = 0;
+
+ if (i > SHA256_BLOCK_SIZE - 8)
+ i = 2 * SHA256_BLOCK_SIZE;
+ else
+ i = SHA256_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8);
+
+ sha256_single_for_mh_sha256(buf, digest);
+ if (i == (2 * SHA256_BLOCK_SIZE))
+ sha256_single_for_mh_sha256(buf + SHA256_BLOCK_SIZE, digest);
+}
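
The tail handling in sha256_for_mh_sha256() follows the usual SHA-256 padding rule: append 0x80, zero-fill, and store the message length in bits as a 64-bit big-endian value at the end of the final block; if fewer than nine bytes remain in the current block, the padding spills into a second block. A small sketch of that rule, with padded_blocks() as an illustrative helper that is not part of ISA-L:

#include <stdint.h>

/* number of 64-byte blocks needed to pad a tail of tail_len bytes (tail_len = len % 64) */
static uint32_t padded_blocks(uint32_t tail_len)
{
	/* one byte for the 0x80 marker plus eight bytes for the bit-length field */
	return (tail_len + 1 + 8 <= 64) ? 1 : 2;
}

/* e.g. padded_blocks(55) == 1, while padded_blocks(56) == 2 */
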
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/Makefile.am b/src/crypto/isa-l/isa-l_crypto/rolling_hash/Makefile.am
new file mode 100644
index 000000000..a16209248
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/Makefile.am
@@ -0,0 +1,57 @@
+########################################################################
+# Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_x86_64 += rolling_hash/rolling_hashx_base.c
+lsrc_x86_64 += rolling_hash/rolling_hash2.c
+lsrc_x86_64 += rolling_hash/rolling_hash2_until_04.asm
+lsrc_x86_64 += rolling_hash/rolling_hash2_until_00.asm
+lsrc_x86_64 += rolling_hash/rolling_hash2_multibinary.asm
+
+lsrc_x86_32 += $(lsrc_x86_64)
+
+lsrc_base_aliases += rolling_hash/rolling_hashx_base.c \
+ rolling_hash/rolling_hash2.c \
+ rolling_hash/rolling_hash2_base_aliases.c
+
+
+lsrc_aarch64 += rolling_hash/rolling_hashx_base.c \
+ rolling_hash/rolling_hash2.c \
+ rolling_hash/aarch64/rolling_hash2_aarch64_multibinary.S \
+ rolling_hash/aarch64/rolling_hash2_aarch64_dispatcher.c \
+ rolling_hash/aarch64/rolling_hash2_run_until_unroll.S
+
+src_include += -I $(srcdir)/rolling_hash
+extern_hdrs += include/rolling_hashx.h
+
+other_src += rolling_hash/rolling_hash2_table.h
+other_src += include/test.h include/types.h
+
+check_tests += rolling_hash/rolling_hash2_test
+perf_tests += rolling_hash/rolling_hash2_perf
+other_tests += rolling_hash/chunking_with_mb_hash
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_dispatcher.c
new file mode 100644
index 000000000..98692e162
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_dispatcher.c
@@ -0,0 +1,37 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+DEFINE_INTERFACE_DISPATCHER(rolling_hash2_run_until)
+{
+ return PROVIDER_INFO(rolling_hash2_run_until_unroll);
+
+ //~ return PROVIDER_BASIC(rolling_hash2_run_until);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_multibinary.S b/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_multibinary.S
new file mode 100644
index 000000000..efbe44a18
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_multibinary.S
@@ -0,0 +1,35 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#include "aarch64_multibinary.h"
+
+
+mbin_interface rolling_hash2_run_until
+
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_run_until_unroll.S b/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_run_until_unroll.S
new file mode 100644
index 000000000..7ba04efbd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_run_until_unroll.S
@@ -0,0 +1,115 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+/*
+ uint64_t rolling_hash2_run_until_unroll(
+ uint32_t * idx, int max_idx, uint64_t * t1,
+ uint64_t * t2, uint8_t * b1, uint8_t * b2, uint64_t h,
+ uint64_t mask, uint64_t trigger)
+*/
+ idx_addr .req x0
+ max_idx .req w1 //signed int
+ t1_addr .req x2
+ t2_addr .req x3
+ b1_addr .req x4
+ b2_addr .req x5
+ h .req x6
+ mask .req x7
+ trigger .req x12
+
+ idx .req w8
+
+ dat1 .req x9
+ dat2 .req x10
+ wdat1 .req w9
+ wdat2 .req w10
+ tmp_loop .req w11
+
+ t1 .req x13
+ t2 .req x14
+
+.macro round off:req
+ ubfx t1,dat1,8*\off\(),8
+ ubfx t2,dat2,8*\off\(),8
+ ldr t1,[t1_addr,t1,lsl 3]
+ ldr t2,[t2_addr,t2,lsl 3]
+ eor t1,t2,t1
+ eor h,t1,h,ror 63
+ and t2,h,mask
+ cmp t2,trigger
+ beq exit_ret
+ add idx, idx,1
+.endm
+ .global rolling_hash2_run_until_unroll
+ .type rolling_hash2_run_until_unroll, %function
+rolling_hash2_run_until_unroll:
+ ldr trigger,[sp]
+ ldr idx,[idx_addr]
+ sub tmp_loop,max_idx,8
+ cmp idx,tmp_loop
+ bge unroll_loop_end
+unroll_loop:
+ ldr dat1,[b1_addr,idx,sxtw]
+ ldr dat2,[b2_addr,idx,sxtw]
+
+ round 0
+ round 1
+ round 2
+ round 3
+ round 4
+ round 5
+ round 6
+ round 7
+ cmp tmp_loop,idx
+ bgt unroll_loop
+unroll_loop_end:
+ cmp idx,max_idx
+ bge exit_ret
+loop:
+ ldrb wdat1,[b1_addr,idx,sxtw]
+ ldrb wdat2,[b2_addr,idx,sxtw]
+ ldr t1,[t1_addr,dat1,lsl 3]
+ ldr t2,[t2_addr,dat2,lsl 3]
+ eor t1,t2,t1
+ eor h,t1,h,ror 63
+ and t2,h,mask
+ cmp t2,trigger
+ beq exit_ret
+ add idx,idx,1
+ cmp max_idx,idx
+ bgt loop
+exit_ret:
+ str idx,[idx_addr]
+ mov x0,h
+ ret
+ .size rolling_hash2_run_until_unroll, .-rolling_hash2_run_until_unroll
+
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/chunking_with_mb_hash.c b/src/crypto/isa-l/isa-l_crypto/rolling_hash/chunking_with_mb_hash.c
new file mode 100644
index 000000000..23062c3ef
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/chunking_with_mb_hash.c
@@ -0,0 +1,222 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <assert.h>
+#include "rolling_hashx.h"
+#include "sha256_mb.h"
+#include "test.h"
+
+#define MAX_BUFFER_SIZE 128*1024*1024
+#define HASH_POOL_SIZE SHA256_MAX_LANES
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define FILTER_BITS 10
+#define FILTER_SIZE (1 << FILTER_BITS)
+#define FILTER_MASK (FILTER_SIZE - 1)
+
+#define BITS_TO_INDEX_LONG 6
+#define MASK_TO_INDEX_LONG ((1 << BITS_TO_INDEX_LONG) - 1)
+
+// Globals
+SHA256_HASH_CTX ctxpool[SHA256_MAX_LANES], *last_ctx;
+SHA256_HASH_CTX_MGR mb_hash_mgr;
+uint64_t filter_table[FILTER_SIZE];
+unsigned long chunks_created = 0;
+unsigned long filter_hits = 0;
+
+// Example function to run on each chunk
+
+void run_fragment(SHA256_HASH_CTX * ctx)
+{
+ uint64_t lookup, set_hash;
+ unsigned int lookup_hash;
+ uint32_t idx;
+
+ chunks_created++;
+
+ // Run a simple lookup filter on chunk using digest
+ lookup_hash = ctx->job.result_digest[0] & FILTER_MASK;
+ lookup = filter_table[lookup_hash];
+
+ idx = ctx->job.result_digest[1];
+
+ set_hash = 1 << (idx & MASK_TO_INDEX_LONG) |
+ 1 << ((idx >> BITS_TO_INDEX_LONG) & MASK_TO_INDEX_LONG) |
+ 1 << ((idx >> (2 * BITS_TO_INDEX_LONG)) & MASK_TO_INDEX_LONG);
+
+ if ((lookup & set_hash) == set_hash)
+ filter_hits++;
+ else
+ filter_table[lookup_hash] = lookup | set_hash;
+}
+
+void setup_chunk_processing(void)
+{
+ int i;
+
+ sha256_ctx_mgr_init(&mb_hash_mgr);
+
+ for (i = 0; i < HASH_POOL_SIZE; i++)
+ hash_ctx_init(&ctxpool[i]);
+
+ last_ctx = &ctxpool[0];
+}
+
+SHA256_HASH_CTX *get_next_job_ctx(void)
+{
+ int i;
+ SHA256_HASH_CTX *ctx;
+
+ if (last_ctx && hash_ctx_complete(last_ctx))
+ return last_ctx;
+
+ for (i = 0; i < HASH_POOL_SIZE; i++) {
+ if (hash_ctx_complete(&ctxpool[i]))
+ return &ctxpool[i];
+ }
+ ctx = sha256_ctx_mgr_flush(&mb_hash_mgr);
+ assert(ctx != NULL);
+ return ctx;
+}
+
+void put_next_job_ctx(SHA256_HASH_CTX * ctx)
+{
+ if (ctx && hash_ctx_complete(ctx))
+ last_ctx = ctx;
+
+ run_fragment(ctx);
+}
+
+void process_chunk(uint8_t * buff, int len)
+{
+ SHA256_HASH_CTX *ctx;
+
+ ctx = get_next_job_ctx();
+ ctx = sha256_ctx_mgr_submit(&mb_hash_mgr, ctx, buff, len, HASH_ENTIRE);
+
+ if (ctx)
+ put_next_job_ctx(ctx);
+}
+
+void finish_chunk_processing(void)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while ((ctx = sha256_ctx_mgr_flush(&mb_hash_mgr)) != NULL)
+ run_fragment(ctx);
+}
+
+int main(void)
+{
+ int i, w;
+ uint8_t *buffer, *p;
+ uint32_t mask, trigger, offset = 0;
+ uint32_t min_chunk, max_chunk, mean_chunk;
+ long remain;
+ struct rh_state2 state;
+ struct perf start, stop;
+
+ // Chunking parameters
+ w = 32;
+ min_chunk = 1024;
+ mean_chunk = 4 * 1024;
+ max_chunk = 32 * 1024;
+ mask = rolling_hashx_mask_gen(mean_chunk, 0);
+ trigger = rand() & mask;
+
+ printf("chunk and hash test w=%d, min=%d, target_ave=%d, max=%d:\n", w, min_chunk,
+ mean_chunk, max_chunk);
+
+ if (min_chunk < w || min_chunk > max_chunk) {
+ printf(" Improper parameters selected\n");
+ return -1;
+ }
+
+ if ((buffer = malloc(MAX_BUFFER_SIZE)) == NULL) {
+ printf("cannot allocate mem\n");
+ return -1;
+ }
+ // Initialize buffer with random data
+ srand(TEST_SEED);
+ for (i = 0; i < MAX_BUFFER_SIZE; i++)
+ buffer[i] = rand();
+
+ // Start chunking test with multi-buffer hashing of results
+ perf_start(&start);
+
+ rolling_hash2_init(&state, w);
+ setup_chunk_processing();
+
+ p = buffer;
+ remain = MAX_BUFFER_SIZE;
+
+ while (remain > max_chunk) {
+ // Skip to min chunk
+ rolling_hash2_reset(&state, p + min_chunk - w);
+ rolling_hash2_run(&state, p + min_chunk, max_chunk - min_chunk,
+ mask, trigger, &offset);
+
+ process_chunk(p, min_chunk + offset);
+
+ p += offset + min_chunk;
+ remain -= (offset + min_chunk);
+ }
+
+ while (remain > min_chunk) {
+ rolling_hash2_reset(&state, p + min_chunk - w);
+ rolling_hash2_run(&state, p + min_chunk, remain - min_chunk,
+ mask, trigger, &offset);
+
+ process_chunk(p, min_chunk + offset);
+
+ p += offset + min_chunk;
+ remain -= (offset + min_chunk);
+ }
+
+ if (remain > 0)
+ process_chunk(p, remain);
+
+ finish_chunk_processing();
+ perf_stop(&stop);
+
+ printf("chunking_with_mb_hash: ");
+ perf_print(stop, start, MAX_BUFFER_SIZE);
+
+ printf(" found %ld chunks, ave_len=%ld, filter hits=%ld\n", chunks_created,
+ MAX_BUFFER_SIZE / chunks_created, filter_hits);
+
+ return 0;
+}
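
The mask and trigger chosen above set the average spacing of chunk boundaries: assuming roughly uniform hash values, each byte position past min_chunk matches the trigger with probability 2^-k for a k-bit mask, so the scan length follows a geometric distribution with mean about 2^k. The helper below is only a back-of-the-envelope estimate; it ignores the truncation at max_chunk and treats rolling_hashx_mask_gen() as a black box that returns such a mask.

#include <stdio.h>

/* rough expected chunk length for a k-bit mask and a fixed minimum chunk size */
static double expected_chunk(unsigned k, double min_chunk)
{
	double mean_scan = (double)(1ULL << k);   /* mean of a geometric distribution with p = 2^-k */
	return min_chunk + mean_scan;             /* truncation at max_chunk is ignored */
}

int main(void)
{
	/* with a 12-bit mask and a 1 KiB minimum, chunks average roughly 5 KiB */
	printf("%.0f\n", expected_chunk(12, 1024));
	return 0;
}
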
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2.c b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2.c
new file mode 100644
index 000000000..4b066e40f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2.c
@@ -0,0 +1,169 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include "rolling_hashx.h"
+#include "rolling_hash2_table.h"
+
+extern
+uint64_t rolling_hash2_run_until(uint32_t * idx, int max_idx, uint64_t * t1,
+ uint64_t * t2, uint8_t * b1, uint8_t * b2, uint64_t h,
+ uint64_t mask, uint64_t trigger);
+
+int rolling_hash2_init(struct rh_state2 *state, uint32_t w)
+{
+ uint32_t i;
+ uint64_t v;
+
+ if (w > FINGERPRINT_MAX_WINDOW)
+ return -1;
+
+ for (i = 0; i < 256; i++) {
+ v = rolling_hash2_table1[i];
+ state->table1[i] = v;
+ state->table2[i] = (v << w) | (v >> (64 - w));
+ }
+ state->w = w;
+ return 0;
+}
+
+void rolling_hash2_reset(struct rh_state2 *state, uint8_t * init_bytes)
+{
+ uint64_t hash;
+ uint32_t i, w;
+
+ hash = 0;
+ w = state->w;
+ for (i = 0; i < w; i++) {
+ hash = (hash << 1) | (hash >> (64 - 1));
+ hash ^= state->table1[init_bytes[i]];
+ }
+ state->hash = hash;
+ memcpy(state->history, init_bytes, w);
+}
+
+static
+uint64_t hash_fn(struct rh_state2 *state, uint64_t h, uint8_t new_char, uint8_t old_char)
+{
+ h = (h << 1) | (h >> (64 - 1));
+ h ^= state->table1[new_char] ^ state->table2[old_char];
+ return h;
+}
+
+uint64_t rolling_hash2_run_until_base(uint32_t * idx, int max_idx, uint64_t * t1,
+ uint64_t * t2, uint8_t * b1, uint8_t * b2, uint64_t h,
+ uint64_t mask, uint64_t trigger)
+{
+ int i = *idx;
+
+ if (trigger == 0) {
+ for (; i < max_idx; i++) {
+ h = (h << 1) | (h >> (64 - 1));
+ h ^= t1[b1[i]] ^ t2[b2[i]];
+ if ((h & mask) == 0) {
+ *idx = i;
+ return h;
+ }
+ }
+ } else {
+ for (; i < max_idx; i++) {
+ h = (h << 1) | (h >> (64 - 1));
+ h ^= t1[b1[i]] ^ t2[b2[i]];
+ if ((h & mask) == trigger) {
+ *idx = i;
+ return h;
+ }
+ }
+ }
+ *idx = i;
+ return h;
+}
+
+int
+rolling_hash2_run(struct rh_state2 *state, uint8_t * buffer, uint32_t buffer_length,
+ uint32_t mask, uint32_t trigger, uint32_t * offset)
+{
+
+ uint32_t i;
+ uint32_t w = state->w;
+ uint64_t hash = state->hash;
+
+ for (i = 0; i < w; i++) {
+ if (i == buffer_length) {
+ *offset = i;
+ // update history
+ memmove(state->history, state->history + i, w - i);
+ memcpy(state->history + w - i, buffer, i);
+ state->hash = hash;
+ return FINGERPRINT_RET_MAX;
+ }
+ hash = hash_fn(state, hash, buffer[i], state->history[i]);
+
+ if ((hash & mask) == trigger) {
+ // found hit
+ i++;
+ *offset = i;
+ memmove(state->history, state->history + i, w - i);
+ memcpy(state->history + w - i, buffer, i);
+ state->hash = hash;
+ return FINGERPRINT_RET_HIT;
+ }
+ }
+
+ hash = rolling_hash2_run_until(&i, buffer_length, state->table1, state->table2,
+ buffer, buffer - w, hash, mask, trigger);
+ if ((hash & mask) == trigger) {
+ // found hit
+ i++;
+ *offset = i;
+ memcpy(state->history, buffer + i - w, w);
+ state->hash = hash;
+ return FINGERPRINT_RET_HIT;
+ }
+ // no hit
+ *offset = i;
+ memcpy(state->history, buffer + i - w, w);
+ state->hash = hash;
+ return FINGERPRINT_RET_MAX;
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver rolling_hash2_init_slver_00000264;
+struct slver rolling_hash2_init_slver = { 0x0264, 0x00, 0x00 };
+
+struct slver rolling_hash2_reset_slver_00000265;
+struct slver rolling_hash2_reset_slver = { 0x0265, 0x00, 0x00 };
+
+struct slver rolling_hash2_run_slver_00000266;
+struct slver rolling_hash2_run_slver = { 0x0266, 0x00, 0x00 };
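
The reason rolling_hash2 can slide its window with one table lookup per byte is the pairing table2[x] = rotl64(table1[x], w) set up in rolling_hash2_init(): after w update steps the contribution a byte made through table1 has been rotated exactly w bits, so XOR-ing table2 of the departing byte cancels it. The self-contained check below demonstrates that property with a throwaway random table rather than the pi-based one shipped in rolling_hash2_table.h.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define W 32   /* window size in bytes */

static uint64_t rotl64(uint64_t x, unsigned r) { return (x << r) | (x >> (64 - r)); }

static uint64_t t1[256], t2[256];

/* hash one window from scratch, same recurrence as rolling_hash2_reset() */
static uint64_t hash_window(const uint8_t *p)
{
	uint64_t h = 0;
	for (int i = 0; i < W; i++)
		h = rotl64(h, 1) ^ t1[p[i]];
	return h;
}

int main(void)
{
	uint8_t buf[W + 1];
	for (int i = 0; i < 256; i++) {
		t1[i] = ((uint64_t)rand() << 32) ^ (uint64_t)rand();
		t2[i] = rotl64(t1[i], W);
	}
	for (int i = 0; i < W + 1; i++)
		buf[i] = (uint8_t)rand();

	uint64_t h = hash_window(buf);                            /* window [0, W) */
	uint64_t rolled = rotl64(h, 1) ^ t1[buf[W]] ^ t2[buf[0]]; /* slide by one byte */
	printf("%s\n", rolled == hash_window(buf + 1) ? "ok" : "mismatch");
	return 0;
}
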
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_base_aliases.c
new file mode 100644
index 000000000..58ee50a92
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_base_aliases.c
@@ -0,0 +1,39 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+uint64_t rolling_hash2_run_until_base(uint32_t * idx, int max_idx, uint64_t * t1,
+ uint64_t * t2, uint8_t * b1, uint8_t * b2, uint64_t h,
+ uint64_t mask, uint64_t trigger);
+uint64_t rolling_hash2_run_until(uint32_t * idx, int max_idx, uint64_t * t1,
+ uint64_t * t2, uint8_t * b1, uint8_t * b2, uint64_t h,
+ uint64_t mask, uint64_t trigger)
+{
+ return rolling_hash2_run_until_base(idx, max_idx, t1, t2, b1, b2, h, mask, trigger);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_multibinary.asm
new file mode 100644
index 000000000..ad62dad74
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_multibinary.asm
@@ -0,0 +1,122 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf32
+
+[bits 32]
+%define def_wrd dd
+%define wrd_sz dword
+%define arg1 esi
+
+%else
+
+default rel
+[bits 64]
+%define def_wrd dq
+%define wrd_sz qword
+%define arg1 rsi
+
+extern rolling_hash2_run_until_00
+extern rolling_hash2_run_until_04
+%endif
+
+extern rolling_hash2_run_until_base
+
+
+section .data
+;;; *_mbinit is the initial value of *_dispatched, which is updated on the first call.
+;;; Therefore, *_dispatch_init is only executed on the first call.
+
+rolling_hash2_run_until_dispatched:
+ def_wrd rolling_hash2_run_until_mbinit
+
+section .text
+
+;;;;
+; rolling_hash2_run_until multibinary function
+;;;;
+mk_global rolling_hash2_run_until, function
+rolling_hash2_run_until_mbinit:
+ endbranch
+ call rolling_hash2_run_until_dispatch_init
+
+rolling_hash2_run_until:
+ jmp wrd_sz [rolling_hash2_run_until_dispatched]
+
+rolling_hash2_run_until_dispatch_init:
+ push arg1
+%ifidn __OUTPUT_FORMAT__, elf32 ;; 32-bit check
+ lea arg1, [rolling_hash2_run_until_base]
+%else
+ push rax
+ push rbx
+ push rcx
+ push rdx
+ lea arg1, [rolling_hash2_run_until_base WRT_OPT] ; Default
+
+ mov eax, 1
+ cpuid
+ lea rbx, [rolling_hash2_run_until_00 WRT_OPT]
+ test ecx, FLAG_CPUID1_ECX_SSE4_1
+ cmovne arg1, rbx
+
+ and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ lea rbx, [rolling_hash2_run_until_00 WRT_OPT]
+
+ jne _done_rolling_hash2_run_until_data_init
+ mov rsi, rbx
+
+ ;; Try for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID1_EBX_AVX2
+ lea rbx, [rolling_hash2_run_until_04 WRT_OPT]
+ cmovne rsi, rbx
+
+ ;; Check whether the OS has enabled xmm and ymm state (xgetbv)
+ xor ecx, ecx
+ xgetbv
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ je _done_rolling_hash2_run_until_data_init
+ lea rsi, [rolling_hash2_run_until_00 WRT_OPT]
+
+_done_rolling_hash2_run_until_data_init:
+ pop rdx
+ pop rcx
+ pop rbx
+ pop rax
+%endif ;; END 32-bit check
+ mov [rolling_hash2_run_until_dispatched], arg1
+ pop arg1
+ ret
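
The assembly above implements a lazy, self-patching dispatcher: rolling_hash2_run_until initially jumps through a pointer that targets an init stub, which probes CPUID/XGETBV once, stores the best implementation back into the pointer, and then jumps to it. A C-level sketch of the same pattern follows; cpu_has_avx2() and the *_impl functions are illustrative stand-ins, not the real probes or kernels.

#include <stdio.h>

typedef long (*hash_fn)(const unsigned char *buf, long len);

/* stand-ins for the real scalar and vectorized kernels */
static long base_impl(const unsigned char *buf, long len) { (void)buf; return len; }
static long avx2_impl(const unsigned char *buf, long len) { (void)buf; return len; }

static int cpu_has_avx2(void) { return 0; }   /* stand-in for the CPUID/XGETBV checks */

static long dispatch_init(const unsigned char *buf, long len);
static hash_fn dispatched = dispatch_init;    /* patched on the first call */

static long dispatch_init(const unsigned char *buf, long len)
{
	dispatched = cpu_has_avx2() ? avx2_impl : base_impl;
	return dispatched(buf, len);
}

/* public entry point: every call after the first goes straight to the chosen kernel */
long hash_run(const unsigned char *buf, long len)
{
	return dispatched(buf, len);
}

int main(void)
{
	unsigned char b[4] = { 0 };
	printf("%ld\n", hash_run(b, 4));
	return 0;
}
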
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_perf.c b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_perf.c
new file mode 100644
index 000000000..da0e0fba7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_perf.c
@@ -0,0 +1,120 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include "rolling_hashx.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 100000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#ifndef FUT_run
+# define FUT_run rolling_hash2_run
+#endif
+#ifndef FUT_init
+# define FUT_init rolling_hash2_init
+#endif
+#ifndef FUT_reset
+# define FUT_reset rolling_hash2_reset
+#endif
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x123f
+#endif
+
+#define TEST_MEM TEST_LEN
+
+int main(int argc, char *argv[])
+{
+ uint8_t *buf;
+ uint32_t mask, trigger, offset = 0;
+ int i, w, ret;
+ long long run_length;
+ struct rh_state2 *state;
+ struct perf start, stop;
+
+ // Test case parameters
+ w = 32;
+ mask = 0xffffffff;
+ trigger = 0x123;
+
+ printf(xstr(FUT_run) "_perf:\n");
+
+ buf = malloc(TEST_LEN);
+ if (buf == NULL) {
+ printf("alloc error: Fail\n");
+ return -1;
+ }
+ if (posix_memalign((void **)&state, 64, sizeof(struct rh_state2))) {
+ printf("alloc error rh_state: Fail\n");;
+ return -1;
+ }
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_LEN; i++)
+ buf[i] = rand();
+
+ printf("Start timed tests\n");
+ fflush(0);
+
+ FUT_init(state, w);
+ FUT_reset(state, buf);
+ ret = FUT_run(state, buf, TEST_LEN, mask, trigger, &offset);
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ ret = FUT_run(state, buf, TEST_LEN, mask, trigger, &offset);
+ }
+ perf_stop(&stop);
+
+ run_length = (ret == FINGERPRINT_RET_HIT) ? offset : TEST_LEN;
+ printf(" returned %d after %lld B\n", ret, run_length);
+ printf(xstr(FUT_run) TEST_TYPE_STR ": ");
+ perf_print(stop, start, run_length * i);
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_table.h b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_table.h
new file mode 100644
index 000000000..366f26374
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_table.h
@@ -0,0 +1,296 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _ROLLING_HASH2_TABLE_H_
+#define _ROLLING_HASH2_TABLE_H_
+
+// values are the fractional part of pi
+// taken from:
+// http://www.herongyang.com/Cryptography/Blowfish-First-8366-Hex-Digits-of-PI.html
+// and from the source code of BlowfishJ
+
+uint64_t rolling_hash2_table1[256] = {
+ 0x243F6A8885A308D3,
+ 0x13198A2E03707344,
+ 0xA4093822299F31D0,
+ 0x082EFA98EC4E6C89,
+ 0x452821E638D01377,
+ 0xBE5466CF34E90C6C,
+ 0xC0AC29B7C97C50DD,
+ 0x3F84D5B5B5470917,
+ 0x9216D5D98979FB1B,
+ 0xD1310BA698DFB5AC,
+ 0x2FFD72DBD01ADFB7,
+ 0xB8E1AFED6A267E96,
+ 0xBA7C9045F12C7F99,
+ 0x24A19947B3916CF7,
+ 0x0801F2E2858EFC16,
+ 0x636920D871574E69,
+ 0xA458FEA3F4933D7E,
+ 0x0D95748F728EB658,
+ 0x718BCD5882154AEE,
+ 0x7B54A41DC25A59B5,
+ 0x9C30D5392AF26013,
+ 0xC5D1B023286085F0,
+ 0xCA417918B8DB38EF,
+ 0x8E79DCB0603A180E,
+ 0x6C9E0E8BB01E8A3E,
+ 0xD71577C1BD314B27,
+ 0x78AF2FDA55605C60,
+ 0xE65525F3AA55AB94,
+ 0x5748986263E81440,
+ 0x55CA396A2AAB10B6,
+ 0xB4CC5C341141E8CE,
+ 0xA15486AF7C72E993,
+ 0xB3EE1411636FBC2A,
+ 0x2BA9C55D741831F6,
+ 0xCE5C3E169B87931E,
+ 0xAFD6BA336C24CF5C,
+ 0x7A32538128958677,
+ 0x3B8F48986B4BB9AF,
+ 0xC4BFE81B66282193,
+ 0x61D809CCFB21A991,
+ 0x487CAC605DEC8032,
+ 0xEF845D5DE98575B1,
+ 0xDC262302EB651B88,
+ 0x23893E81D396ACC5,
+ 0x0F6D6FF383F44239,
+ 0x2E0B4482A4842004,
+ 0x69C8F04A9E1F9B5E,
+ 0x21C66842F6E96C9A,
+ 0x670C9C61ABD388F0,
+ 0x6A51A0D2D8542F68,
+ 0x960FA728AB5133A3,
+ 0x6EEF0B6C137A3BE4,
+ 0xBA3BF0507EFB2A98,
+ 0xA1F1651D39AF0176,
+ 0x66CA593E82430E88,
+ 0x8CEE8619456F9FB4,
+ 0x7D84A5C33B8B5EBE,
+ 0xE06F75D885C12073,
+ 0x401A449F56C16AA6,
+ 0x4ED3AA62363F7706,
+ 0x1BFEDF72429B023D,
+ 0x37D0D724D00A1248,
+ 0xDB0FEAD349F1C09B,
+ 0x075372C980991B7B,
+ 0x25D479D8F6E8DEF7,
+ 0xE3FE501AB6794C3B,
+ 0x976CE0BD04C006BA,
+ 0xC1A94FB6409F60C4,
+ 0x5E5C9EC2196A2463,
+ 0x68FB6FAF3E6C53B5,
+ 0x1339B2EB3B52EC6F,
+ 0x6DFC511F9B30952C,
+ 0xCC814544AF5EBD09,
+ 0xBEE3D004DE334AFD,
+ 0x660F2807192E4BB3,
+ 0xC0CBA85745C8740F,
+ 0xD20B5F39B9D3FBDB,
+ 0x5579C0BD1A60320A,
+ 0xD6A100C6402C7279,
+ 0x679F25FEFB1FA3CC,
+ 0x8EA5E9F8DB3222F8,
+ 0x3C7516DFFD616B15,
+ 0x2F501EC8AD0552AB,
+ 0x323DB5FAFD238760,
+ 0x53317B483E00DF82,
+ 0x9E5C57BBCA6F8CA0,
+ 0x1A87562EDF1769DB,
+ 0xD542A8F6287EFFC3,
+ 0xAC6732C68C4F5573,
+ 0x695B27B0BBCA58C8,
+ 0xE1FFA35DB8F011A0,
+ 0x10FA3D98FD2183B8,
+ 0x4AFCB56C2DD1D35B,
+ 0x9A53E479B6F84565,
+ 0xD28E49BC4BFB9790,
+ 0xE1DDF2DAA4CB7E33,
+ 0x62FB1341CEE4C6E8,
+ 0xEF20CADA36774C01,
+ 0xD07E9EFE2BF11FB4,
+ 0x95DBDA4DAE909198,
+ 0xEAAD8E716B93D5A0,
+ 0xD08ED1D0AFC725E0,
+ 0x8E3C5B2F8E7594B7,
+ 0x8FF6E2FBF2122B64,
+ 0x8888B812900DF01C,
+ 0x4FAD5EA0688FC31C,
+ 0xD1CFF191B3A8C1AD,
+ 0x2F2F2218BE0E1777,
+ 0xEA752DFE8B021FA1,
+ 0xE5A0CC0FB56F74E8,
+ 0x18ACF3D6CE89E299,
+ 0xB4A84FE0FD13E0B7,
+ 0x7CC43B81D2ADA8D9,
+ 0x165FA26680957705,
+ 0x93CC7314211A1477,
+ 0xE6AD206577B5FA86,
+ 0xC75442F5FB9D35CF,
+ 0xEBCDAF0C7B3E89A0,
+ 0xD6411BD3AE1E7E49,
+ 0x00250E2D2071B35E,
+ 0x226800BB57B8E0AF,
+ 0x2464369BF009B91E,
+ 0x5563911D59DFA6AA,
+ 0x78C14389D95A537F,
+ 0x207D5BA202E5B9C5,
+ 0x832603766295CFA9,
+ 0x11C819684E734A41,
+ 0xB3472DCA7B14A94A,
+ 0x1B5100529A532915,
+ 0xD60F573FBC9BC6E4,
+ 0x2B60A47681E67400,
+ 0x08BA6FB5571BE91F,
+ 0xF296EC6B2A0DD915,
+ 0xB6636521E7B9F9B6,
+ 0xFF34052EC5855664,
+ 0x53B02D5DA99F8FA1,
+ 0x08BA47996E85076A,
+ 0x4B7A70E9B5B32944,
+ 0xDB75092EC4192623,
+ 0xAD6EA6B049A7DF7D,
+ 0x9CEE60B88FEDB266,
+ 0xECAA8C71699A17FF,
+ 0x5664526CC2B19EE1,
+ 0x193602A575094C29,
+ 0xA0591340E4183A3E,
+ 0x3F54989A5B429D65,
+ 0x6B8FE4D699F73FD6,
+ 0xA1D29C07EFE830F5,
+ 0x4D2D38E6F0255DC1,
+ 0x4CDD20868470EB26,
+ 0x6382E9C6021ECC5E,
+ 0x09686B3F3EBAEFC9,
+ 0x3C9718146B6A70A1,
+ 0x687F358452A0E286,
+ 0xB79C5305AA500737,
+ 0x3E07841C7FDEAE5C,
+ 0x8E7D44EC5716F2B8,
+ 0xB03ADA37F0500C0D,
+ 0xF01C1F040200B3FF,
+ 0xAE0CF51A3CB574B2,
+ 0x25837A58DC0921BD,
+ 0xD19113F97CA92FF6,
+ 0x9432477322F54701,
+ 0x3AE5E58137C2DADC,
+ 0xC8B576349AF3DDA7,
+ 0xA94461460FD0030E,
+ 0xECC8C73EA4751E41,
+ 0xE238CD993BEA0E2F,
+ 0x3280BBA1183EB331,
+ 0x4E548B384F6DB908,
+ 0x6F420D03F60A04BF,
+ 0x2CB8129024977C79,
+ 0x5679B072BCAF89AF,
+ 0xDE9A771FD9930810,
+ 0xB38BAE12DCCF3F2E,
+ 0x5512721F2E6B7124,
+ 0x501ADDE69F84CD87,
+ 0x7A5847187408DA17,
+ 0xBC9F9ABCE94B7D8C,
+ 0xEC7AEC3ADB851DFA,
+ 0x63094366C464C3D2,
+ 0xEF1C18473215D908,
+ 0xDD433B3724C2BA16,
+ 0x12A14D432A65C451,
+ 0x50940002133AE4DD,
+ 0x71DFF89E10314E55,
+ 0x81AC77D65F11199B,
+ 0x043556F1D7A3C76B,
+ 0x3C11183B5924A509,
+ 0xF28FE6ED97F1FBFA,
+ 0x9EBABF2C1E153C6E,
+ 0x86E34570EAE96FB1,
+ 0x860E5E0A5A3E2AB3,
+ 0x771FE71C4E3D06FA,
+ 0x2965DCB999E71D0F,
+ 0x803E89D65266C825,
+ 0x2E4CC9789C10B36A,
+ 0xC6150EBA94E2EA78,
+ 0xA5FC3C531E0A2DF4,
+ 0xF2F74EA7361D2B3D,
+ 0x1939260F19C27960,
+ 0x5223A708F71312B6,
+ 0xEBADFE6EEAC31F66,
+ 0xE3BC4595A67BC883,
+ 0xB17F37D1018CFF28,
+ 0xC332DDEFBE6C5AA5,
+ 0x6558218568AB9802,
+ 0xEECEA50FDB2F953B,
+ 0x2AEF7DAD5B6E2F84,
+ 0x1521B62829076170,
+ 0xECDD4775619F1510,
+ 0x13CCA830EB61BD96,
+ 0x0334FE1EAA0363CF,
+ 0xB5735C904C70A239,
+ 0xD59E9E0BCBAADE14,
+ 0xEECC86BC60622CA7,
+ 0x9CAB5CABB2F3846E,
+ 0x648B1EAF19BDF0CA,
+ 0xA02369B9655ABB50,
+ 0x40685A323C2AB4B3,
+ 0x319EE9D5C021B8F7,
+ 0x9B540B19875FA099,
+ 0x95F7997E623D7DA8,
+ 0xF837889A97E32D77,
+ 0x11ED935F16681281,
+ 0x0E358829C7E61FD6,
+ 0x96DEDFA17858BA99,
+ 0x57F584A51B227263,
+ 0x9B83C3FF1AC24696,
+ 0xCDB30AEB532E3054,
+ 0x8FD948E46DBC3128,
+ 0x58EBF2EF34C6FFEA,
+ 0xFE28ED61EE7C3C73,
+ 0x5D4A14D9E864B7E3,
+ 0x42105D14203E13E0,
+ 0x45EEE2B6A3AAABEA,
+ 0xDB6C4F15FACB4FD0,
+ 0xC742F442EF6ABBB5,
+ 0x654F3B1D41CD2105,
+ 0xD81E799E86854DC7,
+ 0xE44B476A3D816250,
+ 0xCF62A1F25B8D2646,
+ 0xFC8883A0C1C7B6A3,
+ 0x7F1524C369CB7492,
+ 0x47848A0B5692B285,
+ 0x095BBF00AD19489D,
+ 0x1462B17423820E00,
+ 0x58428D2A0C55F5EA,
+ 0x1DADF43E233F7061,
+ 0x3372F0928D937E41,
+ 0xD65FECF16C223BDB,
+ 0x7CDE3759CBEE7460,
+ 0x4085F2A7CE77326E,
+ 0xA607808419F8509E,
+ 0xE8EFD85561D99735,
+ 0xA969A7AAC50C06C2,
+};
+#endif // _ROLLING_HASH2_TABLE_H_
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_test.c b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_test.c
new file mode 100644
index 000000000..ee45c120d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_test.c
@@ -0,0 +1,314 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdarg.h>
+#include <string.h>
+#include <stdio.h>
+#include "rolling_hashx.h"
+
+#ifndef FUT_run
+# define FUT_run rolling_hash2_run
+#endif
+#ifndef FUT_init
+# define FUT_init rolling_hash2_init
+#endif
+#ifndef FUT_reset
+# define FUT_reset rolling_hash2_reset
+#endif
+#ifndef FUT_ref
+# define FUT_ref rolling_hash2_ref
+#endif
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define MAX_BUFFER_SIZE 128*1024*1024
+#define MAX_ROLLING_HASH_WIDTH 32
+
+#ifndef RANDOMS
+# define RANDOMS 200
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static
+uint64_t rolling_hash2_ref(struct rh_state2 *state, unsigned char *p, int len,
+ uint64_t hash_init)
+{
+ int i;
+ uint64_t h = hash_init;
+
+ for (i = 0; i < len; i++) {
+ h = (h << 1) | (h >> (64 - 1));
+ h ^= state->table1[*p++];
+ }
+ return h;
+}
+
+int ones_in_mask(uint32_t in)
+{
+ int count;
+
+ for (count = 0; in != 0; in &= (in - 1))
+ count++;
+
+ return count;
+}
+
+/*
+ * Utility function to pick a random mask. Not uniform in number of bits.
+ */
+uint32_t pick_rand_mask_in_range(int min_bits, int max_bits)
+{
+ uint32_t mask = 0;
+ int ones;
+
+ do {
+ mask = rand();
+#if defined(_WIN32) || defined(_WIN64)
+ mask = (mask << 16) ^ rand();
+#endif
+ ones = ones_in_mask(mask);
+ } while (ones < min_bits || ones > max_bits);
+
+ return mask;
+}
+
+int main(void)
+{
+ uint8_t *buffer;
+ uint64_t hash;
+ uint32_t mask, trigger, offset = 0;
+ int i, w, r, ret, max, errors = 0;
+ uint32_t offset_fut;
+ struct rh_state2 state;
+
+ printf(xstr(FUT_run) ": " xstr(MAX_BUFFER_SIZE));
+
+ buffer = malloc(MAX_BUFFER_SIZE);
+ if (buffer == NULL) {
+ printf("cannot allocate mem\n");
+ return -1;
+ }
+ srand(TEST_SEED);
+
+ // Test case 1, compare trigger case at boundary with reference hash
+ w = 32;
+ mask = 0xffff0;
+ trigger = 0x3df0;
+ trigger &= mask;
+
+ for (i = 0; i < MAX_BUFFER_SIZE; i++)
+ buffer[i] = rand();
+
+ FUT_init(&state, w);
+ FUT_reset(&state, buffer);
+
+ uint8_t *p = buffer;
+ int remain = MAX_BUFFER_SIZE;
+ ret = FINGERPRINT_RET_HIT;
+
+ while ((ret == FINGERPRINT_RET_HIT) && (remain > 0)) {
+ ret = FUT_run(&state, p, remain, mask, trigger, &offset);
+
+ if (offset > remain) {
+ printf(" error offset past remaining limit\n");
+ errors++;
+ }
+
+ if ((ret == FINGERPRINT_RET_HIT) && (&p[offset] > &buffer[w])) {
+ hash = FUT_ref(&state, &p[offset] - w, w, 0);
+ if ((hash & mask) != trigger) {
+ printf(" mismatch chunk from ref");
+ printf(" hit: offset=%d %lx %lx\n", offset, state.hash, hash);
+ errors++;
+ }
+ }
+ p += offset;
+ remain -= offset;
+ putchar('.');
+ }
+
+ putchar('.'); // Finished test 1
+
+ // Test case 2, check if reference function hits same chunk boundary as test
+
+ w = 32;
+ mask = 0xffff;
+ trigger = rand();
+ trigger &= mask;
+ p = buffer;
+
+ // Function under test
+ FUT_init(&state, w);
+ FUT_reset(&state, p);
+ ret = FUT_run(&state, p + w, MAX_BUFFER_SIZE - w, mask, trigger, &offset_fut);
+ offset_fut += w;
+
+ // Reference
+ for (p++, offset = w + 1; offset < MAX_BUFFER_SIZE; offset++) {
+ hash = FUT_ref(&state, p++, w, 0);
+ if ((hash & mask) == trigger)
+ break;
+ }
+
+ if (offset != offset_fut) {
+ printf("\ncase 2, offset of chunk different from ref\n");
+ printf(" case 2: stop fut at offset=%d\n", offset_fut);
+ printf(" case 2: stop ref at offset=%d\n", offset);
+ errors++;
+ return errors;
+ }
+ putchar('.'); // Finished test 2
+
+ // Do case 2 above with random args
+
+ for (r = 0; r < RANDOMS; r++) {
+ w = rand() % MAX_ROLLING_HASH_WIDTH;
+ if (w < 3)
+ continue;
+
+ mask = pick_rand_mask_in_range(4, 20);
+ trigger = rand() & mask;
+ p = buffer;
+
+ // Function under test
+ FUT_init(&state, w);
+ FUT_reset(&state, p);
+ ret = FUT_run(&state, p + w, MAX_BUFFER_SIZE - w, mask, trigger, &offset_fut);
+ offset_fut += w;
+
+ // Reference
+ for (p++, offset = w + 1; offset < MAX_BUFFER_SIZE; offset++) {
+ hash = FUT_ref(&state, p++, w, 0);
+ if ((hash & mask) == trigger)
+ break;
+ }
+
+ if (offset != offset_fut) {
+ printf("\nrand case 2 #%d: w=%d, mask=0x%x, trigger=0x%x\n", r, w,
+ mask, trigger);
+ printf(" offset of chunk different from ref\n");
+ printf(" case 2r: stop fut at offset=%d\n", offset_fut);
+ printf(" case 2r: stop ref at offset=%d\n", offset);
+ errors++;
+ return errors;
+ }
+ putchar('.');
+ }
+
+ // Test case 3, check if max bound is same
+
+ w = 32;
+ mask = 0xfffff;
+ trigger = rand();
+ trigger &= mask;
+ putchar('|');
+
+ for (max = w + 1; max < 500; max++) {
+ p = buffer;
+ FUT_init(&state, w);
+ FUT_reset(&state, p);
+
+ ret = FUT_run(&state, p + w, max - w, mask, trigger, &offset_fut);
+ offset_fut += w;
+
+ int ret_ref = FINGERPRINT_RET_MAX;
+ for (p++, offset = w + 1; offset < max; offset++) {
+ hash = FUT_ref(&state, p++, w, 0);
+ if ((hash & mask) == trigger) {
+ ret_ref = FINGERPRINT_RET_HIT;
+ break;
+ }
+ }
+
+ if (offset != offset_fut || ret != ret_ref) {
+ printf("\ncase 3 max=%d, offset of chunk different from ref\n", max);
+ printf(" case 3: stop fut at offset=%d\n", offset_fut);
+ printf(" case 3: stop ref at offset=%d\n", offset);
+ printf(" case 3: ret_fut=%d ret_ref=%d\n", ret, ret_ref);
+ errors++;
+ return errors;
+ }
+ putchar('.'); // Finished test 3
+ }
+
+ // Test case 4, check if max bound is same under random params
+
+ for (r = 0; r < RANDOMS; r++) {
+ p = buffer;
+ mask = pick_rand_mask_in_range(24, 30); // Pick an unlikely mask
+ trigger = rand() & mask;
+ w = rand() % MAX_ROLLING_HASH_WIDTH;
+ max = rand() % 1024;
+
+ if (w < 3 || max < 2 * MAX_ROLLING_HASH_WIDTH)
+ continue;
+
+ FUT_init(&state, w);
+ FUT_reset(&state, p);
+
+ ret = FUT_run(&state, p, max, mask, trigger, &offset_fut);
+
+ if (offset_fut <= w)
+ continue;
+
+ int ret_ref = FINGERPRINT_RET_MAX;
+ for (p++, offset = w + 1; offset < max; offset++) {
+ hash = FUT_ref(&state, p++, w, 0);
+ if ((hash & mask) == trigger) {
+ ret_ref = FINGERPRINT_RET_HIT;
+ break;
+ }
+ }
+
+ if (offset != offset_fut || ret != ret_ref) {
+ printf("\ncase 4 rand case different from ref, max=%d w=%d\n", max, w);
+ printf(" case 4: stop fut at offset=%d\n", offset_fut);
+ printf(" case 4: stop ref at offset=%d\n", offset);
+ printf(" case 4: ret_fut=%d ret_ref=%d\n", ret, ret_ref);
+ errors++;
+ return errors;
+ }
+ putchar('.'); // Finished test 4
+
+ if (ret == FINGERPRINT_RET_HIT) {
+ p[-1] = rand(); // Keep hits from repeating
+ }
+ }
+
+ if (errors > 0)
+ printf(" Fail: %d\n", errors);
+ else
+ printf(" Pass\n");
+ return errors;
+}
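The reference function and the mask/trigger checks above are the whole algorithm in miniature: keep a 64-bit hash of the last w bytes using a rotate-and-xor recurrence, and declare a chunk boundary wherever the masked hash equals the trigger. A minimal standalone sketch of that idea, recomputing each window from scratch the way rolling_hash2_ref() does (names here are illustrative and not part of the ISA-L API):

#include <stdint.h>
#include <stddef.h>

/* Hash one window of w bytes: rotate left by 1, then xor a per-byte table
 * value, exactly as rolling_hash2_ref() does above. */
static uint64_t toy_hash_window(const uint64_t table[256], const uint8_t *p,
                                int w, uint64_t h)
{
	for (int i = 0; i < w; i++) {
		h = (h << 1) | (h >> 63);
		h ^= table[p[i]];
	}
	return h;
}

/* Scan for the first position whose window hash matches trigger under mask.
 * This is O(len * w); the library's run functions update the hash
 * incrementally instead of rehashing every window. */
static size_t toy_find_boundary(const uint64_t table[256], const uint8_t *buf,
                                size_t len, int w, uint64_t mask, uint64_t trigger)
{
	for (size_t end = w; end <= len; end++) {
		uint64_t h = toy_hash_window(table, buf + end - w, w, 0);
		if ((h & mask) == trigger)
			return end;	/* boundary after 'end' bytes */
	}
	return len;	/* no hit; treat the end of the buffer as the boundary */
}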
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_00.asm b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_00.asm
new file mode 100644
index 000000000..99091faa4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_00.asm
@@ -0,0 +1,204 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; uint64_t rolling_hash2_run_until_00(uint32_t *idx, uint32_t buffer_length, uint64_t *t1,
+;;; uint64_t *t2, uint8_t *b1, uint8_t *b2, uint64_t h, uint64_t mask,
+;;; uint64_t trigger)
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+
+ %define arg6 r10
+ %define arg7 r11
+ %define arg8 r12 ; must be saved and loaded
+ %define tmp1 rbp ; must be saved and loaded
+ %define tmp2 rbx ; must be saved and loaded
+ %define tmp3 r13 ; must be saved and loaded
+ %define tmp4 r14 ; must be saved and loaded
+ %define tmp5 r15 ; must be saved and loaded
+ %define return rax
+ %define PS 8
+ %define frame_size 6*8
+ %define arg(x) [rsp + frame_size + PS + PS*x]
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push rbp
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ mov arg6, arg(0)
+ mov arg7, arg(1)
+ mov arg8, arg(2)
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ pop rbp
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12 ; must be saved and loaded
+ %define arg5 r13 ; must be saved and loaded
+ %define arg6 r14 ; must be saved and loaded
+ %define arg7 r15 ; must be saved and loaded
+ %define arg8 rbx ; must be saved and loaded
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 rdi ; must be saved and loaded
+ %define tmp4 rsi ; must be saved and loaded
+ %define tmp5 rbp ; must be saved and loaded
+ %define return rax
+ %define PS 8
+ %define frame_size 8*8
+ %define arg(x) [rsp + frame_size + PS + PS*x]
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ push_reg r12
+ push_reg r13
+ push_reg r14
+ push_reg r15
+ push_reg rbx
+ push_reg rdi
+ push_reg rsi
+ push_reg rbp
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+ mov arg6, arg(6)
+ mov arg7, arg(7)
+ mov arg8, arg(8)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop rbp
+ pop rsi
+ pop rdi
+ pop rbx
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%endif
+
+%define idx arg0
+%define max arg1
+%define t1 arg2
+%define t2 arg3
+%define b1 arg4
+%define b2 arg5
+%define hash arg6
+%define mask arg7
+%define trigger arg8
+
+%define pos rax
+%define pos.w eax
+%define x tmp2
+%define y tmp3
+%define z tmp4
+%define h tmp1
+%define a tmp5
+
+default rel
+[bits 64]
+section .text
+
+align 16
+mk_global rolling_hash2_run_until_00, function
+func(rolling_hash2_run_until_00)
+ endbranch
+ FUNC_SAVE
+ mov pos.w, dword [idx]
+ sub max, 2
+ cmp pos, max
+ jg .less_than_2
+
+.loop2: ror hash, 0x3f
+ movzx x, byte [b1 + pos]
+ movzx a, byte [b1 + pos + 1]
+ movzx y, byte [b2 + pos]
+ movzx h, byte [b2 + pos + 1]
+ mov z, [t1 + x * 8]
+ xor z, [t2 + y * 8]
+ xor hash, z
+ mov x, hash
+ and x, mask
+ cmp x, trigger
+ je .ret_0
+
+ ror hash, 0x3f
+ mov z, [t1 + a * 8]
+ xor z, [t2 + h * 8]
+ xor hash, z
+ mov y, hash
+ and y, mask
+ cmp y, trigger
+ je .ret_1
+
+ add pos, 2
+ cmp pos, max
+ jle .loop2
+
+.less_than_2:
+ add max, 1
+ cmp pos, max
+ jg .ret_0
+ ror hash, 0x3f
+ movzx x, byte [b1 + pos]
+ movzx y, byte [b2 + pos]
+ mov z, [t1 + x * 8]
+ xor z, [t2 + y * 8]
+ xor hash, z
+.ret_1: add pos, 1
+.ret_0: mov dword [idx], pos.w
+ mov rax, hash
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
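The kernel above is the constant-work-per-byte form of the same recurrence. Each iteration rotates the hash left by one (ror by 63), xors in a table entry for the byte read through b1 and another for the byte read through b2; the caller presumably passes b1 and b2 offset by the window width, with t2 holding table1 entries pre-rotated by that width so the departing byte's contribution cancels. A plain-C rendering of the loop, with that assumption stated explicitly (the assembly unrolls two bytes per iteration and handles the final odd byte separately; this sketch does not):

#include <stdint.h>

static inline uint64_t rotl64(uint64_t x, int r)
{
	return (x << r) | (x >> (64 - r));
}

/* Simplified model of rolling_hash2_run_until_00().
 * Assumption (not taken from this file): t2[c] == rotl64(t1[c], w) for the
 * window width w, so xoring t2[b2[pos]] removes the byte that left the window. */
static uint64_t toy_run_until(uint32_t *idx, uint32_t max, const uint64_t *t1,
                              const uint64_t *t2, const uint8_t *b1,
                              const uint8_t *b2, uint64_t hash,
                              uint64_t mask, uint64_t trigger)
{
	uint32_t pos = *idx;

	while (pos < max) {
		hash = rotl64(hash, 1);
		hash ^= t1[b1[pos]] ^ t2[b2[pos]];
		if ((hash & mask) == trigger)
			break;	/* leave idx at the hit position, as .ret_0/.ret_1 do */
		pos++;
	}
	*idx = pos;
	return hash;
}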
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_04.asm b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_04.asm
new file mode 100644
index 000000000..3f4e8353b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_04.asm
@@ -0,0 +1,203 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; uint64_t rolling_hash2_run_until_04(uint32_t *idx, uint32_t max_idx, uint64_t *t1,
+;;; uint64_t *t2, uint8_t *b1, uint8_t *b2, uint64_t h, uint64_t mask,
+;;; uint64_t trigger)
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+
+ %define arg6 r10
+ %define arg7 r11
+ %define arg8 r12 ; must be saved and loaded
+ %define tmp1 rbp ; must be saved and loaded
+ %define tmp2 rbx ; must be saved and loaded
+ %define tmp3 r13 ; must be saved and loaded
+ %define tmp4 r14 ; must be saved and loaded
+ %define tmp5 r15 ; must be saved and loaded
+ %define return rax
+ %define PS 8
+ %define frame_size 6*8
+ %define arg(x) [rsp + frame_size + PS + PS*x]
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push rbp
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ mov arg6, arg(0)
+ mov arg7, arg(1)
+ mov arg8, arg(2)
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ pop rbp
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12 ; must be saved and loaded
+ %define arg5 r13 ; must be saved and loaded
+ %define arg6 r14 ; must be saved and loaded
+ %define arg7 r15 ; must be saved and loaded
+ %define arg8 rbx ; must be saved and loaded
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 rdi ; must be saved and loaded
+ %define tmp4 rsi ; must be saved and loaded
+ %define tmp5 rbp ; must be saved and loaded
+ %define return rax
+ %define PS 8
+ %define frame_size 8*8
+ %define arg(x) [rsp + frame_size + PS + PS*x]
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ push_reg r12
+ push_reg r13
+ push_reg r14
+ push_reg r15
+ push_reg rbx
+ push_reg rdi
+ push_reg rsi
+ push_reg rbp
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+ mov arg6, arg(6)
+ mov arg7, arg(7)
+ mov arg8, arg(8)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop rbp
+ pop rsi
+ pop rdi
+ pop rbx
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%endif
+
+%define idx arg0
+%define max arg1
+%define t1 arg2
+%define t2 arg3
+%define b1 arg4
+%define b2 arg5
+%define hash arg6
+%define mask arg7
+%define trigger arg8
+
+%define pos rax
+%define pos.w eax
+%define x tmp2
+%define y tmp3
+%define z tmp4
+%define h tmp1
+%define a tmp5
+
+default rel
+[bits 64]
+section .text
+
+align 16
+mk_global rolling_hash2_run_until_04, function
+func(rolling_hash2_run_until_04)
+ endbranch
+ FUNC_SAVE
+ mov pos.w, dword [idx]
+ pext trigger, trigger, mask
+ sub max, 2
+ cmp pos, max
+ jg .less_than_2
+
+.loop2: rorx hash, hash, 0x3f
+ movzx x, byte [b1 + pos]
+ movzx a, byte [b1 + pos + 1]
+ movzx y, byte [b2 + pos]
+ movzx h, byte [b2 + pos + 1]
+ mov z, [t1 + x * 8]
+ xor z, [t2 + y * 8]
+ xor hash, z
+ pext x, hash, mask
+ cmp x, trigger
+ je .ret_0
+
+ rorx hash, hash, 0x3f
+ mov z, [t1 + a * 8]
+ xor z, [t2 + h * 8]
+ xor hash, z
+ pext y, hash, mask
+ cmp y, trigger
+ je .ret_1
+
+ add pos, 2
+ cmp pos, max
+ jle .loop2
+
+.less_than_2:
+ add max, 1
+ cmp pos, max
+ jg .ret_0
+ rorx hash, hash, 0x3f
+ movzx x, byte [b1 + pos]
+ movzx y, byte [b2 + pos]
+ mov z, [t1 + x * 8]
+ xor z, [t2 + y * 8]
+ xor hash, z
+.ret_1: add pos, 1
+.ret_0: mov dword [idx], pos.w
+ mov rax, hash
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
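rolling_hash2_until_04.asm is the same loop specialized for BMI2: rorx rotates without touching flags, and the and/cmp pair of the _00 version collapses into a single pext against a trigger that was itself pext-ed once before the loop. The two boundary tests are equivalent, as this small illustration shows (requires a BMI2-capable target, e.g. -mbmi2; not code from the file):

#include <stdint.h>
#include <immintrin.h>

/* _00 style: mask, then compare. */
static inline int hit_and_cmp(uint64_t hash, uint64_t mask, uint64_t trigger)
{
	return (hash & mask) == (trigger & mask);
}

/* _04 style: pext compacts the masked bits of both operands, so the
 * comparison runs on already-aligned bit strings.  trigger_pext is
 * _pext_u64(trigger, mask), computed once outside the loop. */
static inline int hit_pext(uint64_t hash, uint64_t mask, uint64_t trigger_pext)
{
	return _pext_u64(hash, mask) == trigger_pext;
}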
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hashx_base.c b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hashx_base.c
new file mode 100644
index 000000000..4197def0e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hashx_base.c
@@ -0,0 +1,65 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#ifdef _MSC_VER
+# define inline __inline
+#endif
+
+inline int floor_pow2(uint32_t in)
+{
+ uint32_t x = in;
+
+ while (in) {
+ x = in;
+ in &= (in - 1);
+ }
+ return x;
+}
+
+inline uint32_t rol(uint32_t x, int i)
+{
+ return x << i | x >> (8 * sizeof(x) - i);
+}
+
+uint32_t rolling_hashx_mask_gen(long mean, int shift)
+{
+ if (mean <= 2)
+ mean = 2;
+
+ return rol(floor_pow2(mean) - 1, shift);
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver rolling_hashx_mask_gen_slver_00000260;
+struct slver rolling_hashx_mask_gen_slver = { 0x0260, 0x00, 0x00 };
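rolling_hashx_mask_gen() turns a requested mean chunk size into the mask the run functions test against: floor_pow2() keeps only the highest set bit, subtracting one yields a block of contiguous low bits, and rol() lets the caller relocate those bits. A worked example (arithmetic done by hand; assumes the function is declared in rolling_hashx.h, as the test file's include suggests):

#include <stdint.h>
#include <stdio.h>
#include "rolling_hashx.h"

int main(void)
{
	/* floor_pow2(8192) == 0x2000; 0x2000 - 1 == 0x1fff (13 bits);
	 * rol(0x1fff, 0) == 0x1fff.  A random hash matches a 13-bit mask with
	 * probability 1/8192, so boundaries land ~8 KiB apart on average. */
	uint32_t mask = rolling_hashx_mask_gen(8192, 0);

	printf("mask = 0x%x\n", mask);
	return 0;
}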
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am
new file mode 100644
index 000000000..3f3c589ad
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am
@@ -0,0 +1,130 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_x86_64 += sha1_mb/sha1_ctx_sse.c \
+ sha1_mb/sha1_ctx_avx.c \
+ sha1_mb/sha1_ctx_avx2.c \
+ sha1_mb/sha1_ctx_base.c
+
+lsrc_x86_64 += sha1_mb/sha1_mb_mgr_init_sse.c \
+ sha1_mb/sha1_mb_mgr_init_avx2.c
+
+lsrc_x86_64 += sha1_mb/sha1_mb_mgr_submit_sse.asm \
+ sha1_mb/sha1_mb_mgr_submit_avx.asm \
+ sha1_mb/sha1_mb_mgr_submit_avx2.asm \
+ sha1_mb/sha1_mb_mgr_flush_sse.asm \
+ sha1_mb/sha1_mb_mgr_flush_avx.asm \
+ sha1_mb/sha1_mb_mgr_flush_avx2.asm \
+ sha1_mb/sha1_mb_x4_sse.asm \
+ sha1_mb/sha1_mb_x4_avx.asm \
+ sha1_mb/sha1_mb_x8_avx2.asm \
+ sha1_mb/sha1_multibinary.asm
+
+lsrc_x86_64 += sha1_mb/sha1_ctx_avx512.c \
+ sha1_mb/sha1_mb_mgr_init_avx512.c \
+ sha1_mb/sha1_mb_mgr_submit_avx512.asm \
+ sha1_mb/sha1_mb_mgr_flush_avx512.asm \
+ sha1_mb/sha1_mb_x16_avx512.asm
+
+lsrc_x86_64 += sha1_mb/sha1_opt_x1.asm
+
+lsrc_x86_64 += sha1_mb/sha1_ni_x1.asm \
+ sha1_mb/sha1_ni_x2.asm \
+ sha1_mb/sha1_ctx_sse_ni.c \
+ sha1_mb/sha1_ctx_avx512_ni.c \
+ sha1_mb/sha1_mb_mgr_submit_sse_ni.asm \
+ sha1_mb/sha1_mb_mgr_flush_sse_ni.asm \
+ sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm
+
+lsrc_x86_32 += $(lsrc_x86_64)
+
+lsrc_aarch64 += sha1_mb/sha1_ctx_base.c \
+ sha1_mb/sha1_ref.c \
+ sha1_mb/aarch64/sha1_mb_multibinary.S \
+ sha1_mb/aarch64/sha1_ctx_ce.c \
+ sha1_mb/aarch64/sha1_mb_x1_ce.S \
+ sha1_mb/aarch64/sha1_mb_x2_ce.S \
+ sha1_mb/aarch64/sha1_mb_mgr_ce.c \
+ sha1_mb/aarch64/sha1_ctx_asimd.c \
+ sha1_mb/aarch64/sha1_aarch64_x1.S \
+ sha1_mb/aarch64/sha1_mb_asimd_x4.S \
+ sha1_mb/aarch64/sha1_mb_mgr_asimd.c \
+ sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c
+
+
+
+lsrc_base_aliases += sha1_mb/sha1_ctx_base_aliases.c \
+ sha1_mb/sha1_ctx_base.c \
+ sha1_mb/sha1_ref.c
+
+src_include += -I $(srcdir)/sha1_mb
+
+extern_hdrs += include/sha1_mb.h \
+ include/multi_buffer.h
+
+other_src += include/datastruct.asm \
+ include/multibinary.asm \
+ sha1_mb/sha1_job.asm \
+ sha1_mb/sha1_mb_mgr_datastruct.asm \
+ include/reg_sizes.asm \
+ sha1_mb/sha1_ref.c \
+ include/memcpy_inline.h \
+ include/memcpy.asm \
+ include/intrinreg.h
+
+check_tests += sha1_mb/sha1_mb_test \
+ sha1_mb/sha1_mb_rand_test \
+ sha1_mb/sha1_mb_rand_update_test \
+ sha1_mb/sha1_mb_flush_test
+
+unit_tests += sha1_mb/sha1_mb_rand_ssl_test
+
+perf_tests += sha1_mb/sha1_mb_vs_ossl_perf \
+ sha1_mb/sha1_mb_vs_ossl_shortage_perf
+
+examples += sha1_mb/sha1_multi_buffer_example
+
+
+sha1_mb_rand_test: sha1_ref.o
+sha1_mb_sha1_mb_rand_test_LDADD = sha1_mb/sha1_ref.lo libisal_crypto.la
+
+sha1_mb_rand_update_test: sha1_ref.o
+sha1_mb_sha1_mb_rand_update_test_LDADD = sha1_mb/sha1_ref.lo libisal_crypto.la
+
+sha1_mb_flush_test: sha1_ref.o
+sha1_mb_sha1_mb_flush_test_LDADD = sha1_mb/sha1_ref.lo libisal_crypto.la
+
+sha1_mb_rand_ssl_test: LDLIBS += -lcrypto
+sha1_mb_sha1_mb_rand_ssl_test_LDFLAGS = -lcrypto
+
+sha1_mb_vs_ossl_perf: LDLIBS += -lcrypto
+sha1_mb_sha1_mb_vs_ossl_perf_LDFLAGS = -lcrypto
+
+sha1_mb_vs_ossl_shortage_perf: LDLIBS += -lcrypto
+sha1_mb_sha1_mb_vs_ossl_shortage_perf_LDFLAGS = -lcrypto
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_aarch64_x1.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_aarch64_x1.S
new file mode 100644
index 000000000..55d6f932f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_aarch64_x1.S
@@ -0,0 +1,294 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .arch armv8-a
+
+ input_data .req x0
+ num_blocks .req w1
+ digest .req x2
+
+ // x2 is reused intentionally between digest/tmp
+ // due to running out of registers
+ TMP .req x2
+ TMPW .req w2
+ sha1key_adr .req x3
+ WK .req w3
+ WF .req w4
+ WA .req w5
+ WB .req w6
+ WC .req w7
+ WD .req w8
+ WE .req w9
+ WORD0 .req w10
+ WORD1 .req w11
+ WORD2 .req w12
+ WORD3 .req w13
+ WORD4 .req w14
+ WORD5 .req w15
+ WORD6 .req w16
+ WORD7 .req w17
+ WORD8 .req w18
+ WORD9 .req w19
+ WORD10 .req w20
+ WORD11 .req w21
+ WORD12 .req w22
+ WORD13 .req w23
+ WORD14 .req w24
+ WORD15 .req w25
+ AA .req w26
+ BB .req w27
+ CC .req w28
+ DD .req w29
+ EE .req w30
+
+ TT .req w0
+
+.macro save_stack
+ stp x16,x17,[sp, -128]!
+ stp x18,x19,[sp, 16]
+ stp x20,x21,[sp, 32]
+ stp x22,x23,[sp, 48]
+ stp x24,x25,[sp, 64]
+ stp x26,x27,[sp, 80]
+ stp x28,x29,[sp, 96]
+ str x30,[sp, 112]
+ // have to reuse x2, which is digest address
+ str x2,[sp, 120]
+.endm
+
+.macro restore_stack
+ ldp x18,x19,[sp, 16]
+ ldp x20,x21,[sp, 32]
+ ldp x22,x23,[sp, 48]
+ ldp x24,x25,[sp, 64]
+ ldp x26,x27,[sp, 80]
+ ldp x28,x29,[sp, 96]
+ ldr x30,[sp, 112]
+ ldr x2,[sp, 120]
+ ldp x16,x17,[sp],128
+.endm
+// macro F = (D ^ (B & (C ^ D)))
+.macro FUNC_F0
+ eor WF, WC, WD
+ and WF, WB, WF
+ eor WF, WD, WF
+.endm
+
+// F = (B ^ C ^ D)
+.macro FUNC_F1
+ eor WF, WB, WC
+ eor WF, WF, WD
+.endm
+
+// F = ((B & C) | (B & D) | (C & D))
+.macro FUNC_F2
+ and TMPW, WB, WC
+ and WF, WB, WD
+ orr WF, WF, TMPW
+ and TMPW, WC, WD
+ orr WF, WF, TMPW
+.endm
+
+// F = (B ^ C ^ D)
+.macro FUNC_F3
+ FUNC_F1
+.endm
+
+.altmacro
+.macro load_next_word windex
+ .if \windex < 16
+ load_word_at \windex
+ .endif
+.endm
+
+.macro SHA1_STEP_00_15 windex:req
+ rev WORD\windex\(),WORD\windex\()
+ next_word=\windex+1
+ load_next_word %next_word
+
+ ror TMPW,WA,#32-5
+ add WE,WE,TMPW
+ add WE,WE,WK
+ FUNC_F0
+ ror WB,WB,#32-30
+ add WE,WE,WORD\windex\()
+ add WE,WE,WF
+.endm
+
+.macro SHA1_STEP_16_79 windex:req,func_f:req,reg_3:req,reg_8:req,reg_14:req,reg_16:req
+ eor TMPW,\reg_14,\reg_8
+ eor \reg_16,\reg_16,\reg_3
+ eor \reg_16,\reg_16,TMPW
+
+ ror TMPW,WA,#32-5
+ ror \reg_16,\reg_16, #32 - 1
+
+ add WE,WE,TMPW
+ add WE,WE,WK
+ \func_f
+ ror WB,WB,#32-30
+ add WE,WE,\reg_16
+ add WE,WE,WF
+.endm
+
+.macro SWAP_STATES
+ .unreq TT
+ TT .req WE
+ .unreq WE
+ WE .req WD
+ .unreq WD
+ WD .req WC
+ .unreq WC
+ WC .req WB
+ .unreq WB
+ WB .req WA
+ .unreq WA
+ WA .req TT
+.endm
+
+.altmacro
+.macro SHA1_STEP_16_79_WRAPPER windex:req,func_f:req,idx3:req,idx8:req,idx14:req,idx16:req
+ SHA1_STEP_16_79 \windex,\func_f,WORD\idx3\(),WORD\idx8\(),WORD\idx14\(),WORD\idx16\()
+.endm
+
+.macro exec_step windex:req
+ .if \windex <= 15
+ SHA1_STEP_00_15 windex
+ .else
+ idx14=((\windex - 14) & 15)
+ idx8=((\windex - 8) & 15)
+ idx3=((\windex - 3) & 15)
+ idx16=(\windex & 15)
+ .if \windex <= 19
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F0,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 20 && \windex <= 39
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F1,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 40 && \windex <= 59
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F2,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 60 && \windex <= 79
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F3,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .endif
+
+ SWAP_STATES
+.endm
+
+.macro exec_steps idx:req,more:vararg
+ exec_step \idx
+ .ifnb \more
+ exec_steps \more
+ .endif
+.endm
+
+.altmacro
+
+.macro load_two_words_at idx0:req,idx1:req
+ ldp WORD\idx0\(),WORD\idx1\(),[input_data],8
+.endm
+
+.macro load_word_at idx:req
+ .if \idx % 2 == 0
+ idx1=\idx+1
+ load_two_words_at \idx,%idx1
+ .endif
+.endm
+
+/*
+ * void sha1_aarch64_x1(uint32_t *input_data, int num_blocks, uint32_t digest[5])
+ */
+ .global sha1_aarch64_x1
+ .type sha1_aarch64_x1, %function
+sha1_aarch64_x1:
+ cmp num_blocks, #0
+ beq .return
+
+ ldp WA,WB,[digest]
+ ldp WC,WD,[digest,8]
+ ldr WE,[digest,16]
+ save_stack
+
+.block_loop:
+ mov AA, WA
+ mov BB, WB
+ mov CC, WC
+ mov DD, WD
+ mov EE, WE
+
+ load_word_at 0
+
+ adr sha1key_adr, KEY_0
+ ldr WK, [sha1key_adr]
+ exec_steps 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
+
+ // 20 ~ 39
+ adr sha1key_adr, KEY_1
+ ldr WK, [sha1key_adr]
+ exec_steps 20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
+
+ // 40 ~ 59
+ adr sha1key_adr, KEY_2
+ ldr WK, [sha1key_adr]
+ exec_steps 40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59
+
+ // 60 ~ 79
+ adr sha1key_adr, KEY_3
+ ldr WK, [sha1key_adr]
+ exec_steps 60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79
+
+ add WA, AA, WA
+ add WB, BB, WB
+ add WC, CC, WC
+ add WD, DD, WD
+ add WE, EE, WE
+
+ subs num_blocks, num_blocks, 1
+ bne .block_loop
+
+ restore_stack
+ stp WA,WB,[digest]
+ stp WC,WD,[digest,8]
+ str WE,[digest,16]
+
+.return:
+ ret
+
+ .size sha1_aarch64_x1, .-sha1_aarch64_x1
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 16
+KEY_0:
+ .word 0x5a827999
+KEY_1:
+ .word 0x6ed9eba1
+KEY_2:
+ .word 0x8f1bbcdc
+KEY_3:
+ .word 0xca62c1d6
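Each exec_step above is one textbook SHA-1 round; the .unreq/.req renaming in SWAP_STATES stands in for the usual rotation of the five working variables, and KEY_0..KEY_3 hold the four round constants. For reference, the same round written in C (rounds 0-19 shown with F0 and KEY_0; later rounds substitute F1/F2/F3 and the other constants; an illustration, not code from the file):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, int r)
{
	return (x << r) | (x >> (32 - r));
}

/* One SHA-1 round for steps 0-19. */
static void sha1_round_f0(uint32_t s[5], uint32_t w_i)
{
	uint32_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4];
	uint32_t f = d ^ (b & (c ^ d));				/* FUNC_F0 */
	uint32_t t = rotl32(a, 5) + f + e + 0x5a827999u + w_i;

	s[4] = d;			/* e <- d        */
	s[3] = c;			/* d <- c        */
	s[2] = rotl32(b, 30);		/* c <- b <<< 30 */
	s[1] = a;			/* b <- a        */
	s[0] = t;			/* a <- t        */
}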
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_asimd_common.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_asimd_common.S
new file mode 100644
index 000000000..c8b8dd982
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_asimd_common.S
@@ -0,0 +1,269 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .arch armv8-a
+
+// macro F = (D ^ (B & (C ^ D)))
+.macro FUNC_F0
+ eor VF.16b, VC.16b, VD.16b
+ and VF.16b, VB.16b, VF.16b
+ eor VF.16b, VD.16b, VF.16b
+.endm
+
+// F = (B ^ C ^ D)
+.macro FUNC_F1
+ eor VF.16b, VB.16b, VC.16b
+ eor VF.16b, VF.16b, VD.16b
+.endm
+
+// F = ((B & C) | (B & D) | (C & D))
+.macro FUNC_F2
+ and vT0.16b, VB.16b, VC.16b
+ and vT1.16b, VB.16b, VD.16b
+ and vT2.16b, VC.16b, VD.16b
+ orr VF.16b, vT0.16b, vT1.16b
+ orr VF.16b, VF.16b, vT2.16b
+.endm
+
+// F = (B ^ C ^ D)
+.macro FUNC_F3
+ FUNC_F1
+.endm
+
+.altmacro
+.macro load_next_word windex
+ .if \windex < 16
+ load_x4_word \windex
+ .endif
+.endm
+
+// FUNC_F0 is merged into STEP_00_15 for efficiency
+.macro SHA1_STEP_00_15_F0 windex:req
+ rev32 WORD\windex\().16b,WORD\windex\().16b
+ next_word=\windex+1
+ load_next_word %next_word
+ // e = (a leftrotate 5) + f + e + k + w[i]
+ ushr VT.4s, VA.4s, 32 - 5
+ add VE.4s, VE.4s, VK.4s
+ sli VT.4s, VA.4s, 5
+ eor VF.16b, VC.16b, VD.16b
+ add VE.4s, VE.4s, WORD\windex\().4s
+ and VF.16b, VB.16b, VF.16b
+ add VE.4s, VE.4s, VT.4s
+ eor VF.16b, VD.16b, VF.16b
+ ushr VT.4s, VB.4s, 32 - 30
+ add VE.4s, VE.4s, VF.4s
+ sli VT.4s, VB.4s, 30
+.endm
+
+.macro SHA1_STEP_16_79 windex:req,func_f:req,reg_3:req,reg_8:req,reg_14:req,reg_16:req
+ eor vT0.16b,\reg_3\().16b,\reg_8\().16b
+ eor VT.16b,\reg_14\().16b,\reg_16\().16b
+ eor vT0.16b,vT0.16b,VT.16b
+ // e = (a leftrotate 5) + f + e + k + w[i]
+ ushr VT.4s, vT0.4s, 32 - 1
+ add VE.4s, VE.4s, VK.4s
+ ushr vT1.4s, VA.4s, 32 - 5
+ sli VT.4s, vT0.4s, 1
+ add VE.4s, VE.4s, VT.4s
+ sli vT1.4s, VA.4s, 5
+ mov \reg_16\().16b,VT.16b
+ add VE.4s, VE.4s, vT1.4s
+ ushr VT.4s, VB.4s, 32 - 30
+ \func_f
+ add VE.4s, VE.4s, VF.4s
+ sli VT.4s, VB.4s, 30
+.endm
+
+ VA .req v0
+ VB .req v1
+ VC .req v2
+ VD .req v3
+ VE .req v4
+ VT .req v5
+ VF .req v6
+ VK .req v7
+ WORD0 .req v8
+ WORD1 .req v9
+ WORD2 .req v10
+ WORD3 .req v11
+ WORD4 .req v12
+ WORD5 .req v13
+ WORD6 .req v14
+ WORD7 .req v15
+ WORD8 .req v16
+ WORD9 .req v17
+ WORD10 .req v18
+ WORD11 .req v19
+ WORD12 .req v20
+ WORD13 .req v21
+ WORD14 .req v22
+ WORD15 .req v23
+ vT0 .req v24
+ vT1 .req v25
+ vT2 .req v26
+ vAA .req v27
+ vBB .req v28
+ vCC .req v29
+ vDD .req v30
+ vEE .req v31
+ TT .req v0
+ sha1key_adr .req x15
+
+.macro SWAP_STATES
+ // shifted VB is held in VT after each step
+ .unreq TT
+ TT .req VE
+ .unreq VE
+ VE .req VD
+ .unreq VD
+ VD .req VC
+ .unreq VC
+ VC .req VT
+ .unreq VT
+ VT .req VB
+ .unreq VB
+ VB .req VA
+ .unreq VA
+ VA .req TT
+.endm
+
+.altmacro
+.macro SHA1_STEP_16_79_WRAPPER windex:req,func_f:req,idx3:req,idx8:req,idx14:req,idx16:req
+ SHA1_STEP_16_79 \windex,\func_f,WORD\idx3\(),WORD\idx8\(),WORD\idx14\(),WORD\idx16\()
+.endm
+
+.macro exec_step windex:req
+ .if \windex <= 15
+ SHA1_STEP_00_15_F0 windex
+ .else
+ idx14=((\windex - 14) & 15)
+ idx8=((\windex - 8) & 15)
+ idx3=((\windex - 3) & 15)
+ idx16=(\windex & 15)
+ .if \windex <= 19
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F0,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 20 && \windex <= 39
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F1,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 40 && \windex <= 59
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F2,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 60 && \windex <= 79
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F3,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .endif
+
+ SWAP_STATES
+
+ .if \windex == 79
+	// after 80 steps, the registers ABCDET have shifted from
+	// their original order of 012345 to 341520;
+	// we have to swap back for both compile- and run-time correctness
+ mov v0.16b,v3.16b
+ .unreq VA
+ VA .req v0
+
+ mov vT0.16b,v2.16b
+ mov v2.16b,v1.16b
+ mov v1.16b,v4.16b
+ .unreq VB
+ VB .req v1
+ .unreq VC
+ VC .req v2
+
+ mov v3.16b,v5.16b
+ .unreq VD
+ VD .req v3
+
+ mov v4.16b,vT0.16b
+ .unreq VE
+ VE .req v4
+
+ .unreq VT
+ VT .req v5
+ .endif
+.endm
+
+.macro exec_steps idx:req,more:vararg
+ exec_step \idx
+ .ifnb \more
+ exec_steps \more
+ .endif
+.endm
+
+.macro sha1_single
+ load_x4_word 0
+
+ mov vAA.16B, VA.16B
+ mov vBB.16B, VB.16B
+ mov vCC.16B, VC.16B
+ mov vDD.16B, VD.16B
+ mov vEE.16B, VE.16B
+
+ adr sha1key_adr, KEY_0
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
+
+ // 20 ~ 39
+ adr sha1key_adr, KEY_1
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
+
+ // 40 ~ 59
+ adr sha1key_adr, KEY_2
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59
+
+ // 60 ~ 79
+ adr sha1key_adr, KEY_3
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79
+
+ add VA.4s, vAA.4s, VA.4s
+ add VB.4s, vBB.4s, VB.4s
+ add VC.4s, vCC.4s, VC.4s
+ add VD.4s, vDD.4s, VD.4s
+ add VE.4s, vEE.4s, VE.4s
+.endm
+
+.macro sha1_asimd_save_stack
+ stp d8,d9,[sp, -64]!
+ stp d10,d11,[sp, 16]
+ stp d12,d13,[sp, 32]
+ stp d14,d15,[sp, 48]
+.endm
+
+.macro sha1_asimd_restore_stack
+ ldp d10,d11,[sp, 16]
+ ldp d12,d13,[sp, 32]
+ ldp d14,d15,[sp, 48]
+ ldp d8,d9,[sp],64
+.endm
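sha1_asimd_common.S carries the same round structure into NEON registers: every operand is a .4s vector, so one step advances four 32-bit lanes with identical arithmetic. Conceptually that is four independent SHA-1 states updated in lockstep, one message per lane (lane count inferred from the .4s arrangement; a rough model, not the file's code):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, int r)
{
	return (x << r) | (x >> (32 - r));
}

/* Rough model of SHA1_STEP_00_15_F0: the scalar F0 round applied to four
 * lanes at once.  The register renaming the assembly performs with
 * SWAP_STATES is not modeled here. */
static void step_f0_x4(const uint32_t a[4], uint32_t b[4], const uint32_t c[4],
                       const uint32_t d[4], uint32_t e[4],
                       const uint32_t w[4], uint32_t k)
{
	for (int lane = 0; lane < 4; lane++) {
		uint32_t f = d[lane] ^ (b[lane] & (c[lane] ^ d[lane]));
		e[lane] += rotl32(a[lane], 5) + f + k + w[lane];
		b[lane] = rotl32(b[lane], 30);
	}
}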
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_asimd.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_asimd.c
new file mode 100644
index 000000000..9a9952ff6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_asimd.c
@@ -0,0 +1,250 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+void sha1_mb_mgr_init_asimd(SHA1_MB_JOB_MGR * state);
+SHA1_JOB *sha1_mb_mgr_submit_asimd(SHA1_MB_JOB_MGR * state, SHA1_JOB * job);
+SHA1_JOB *sha1_mb_mgr_flush_asimd(SHA1_MB_JOB_MGR * state);
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_asimd(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_asimd(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_asimd(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_fixedlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_asimd(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_asimd(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_asimd(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_asimd_slver_02020142;
+struct slver sha1_ctx_mgr_init_asimd_slver = { 0x0142, 0x02, 0x02 };
+
+struct slver sha1_ctx_mgr_submit_asimd_slver_02020143;
+struct slver sha1_ctx_mgr_submit_asimd_slver = { 0x0143, 0x02, 0x02 };
+
+struct slver sha1_ctx_mgr_flush_asimd_slver_02020144;
+struct slver sha1_ctx_mgr_flush_asimd_slver = { 0x0144, 0x02, 0x02 };
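The submit/flush pair above follows the usual ISA-L multi-buffer pattern: submit work on as many contexts as desired, then flush until NULL to drain whatever is still in flight. A hedged usage sketch for the single-buffer case (hash_ctx_init() and any alignment requirements on SHA1_HASH_CTX_MGR are assumed from the multi_buffer headers and may differ; the digest is read from ctx->job.result_digest, as the code above does):

#include <stdint.h>
#include "sha1_mb.h"

/* Sketch only: hash one whole buffer with the ASIMD context manager.
 * hash_ctx_init() is assumed to exist in multi_buffer.h; error and status
 * checking (ctx->error, ctx->status) is omitted for brevity. */
static void sha1_one_buffer_asimd(const void *buf, uint32_t len,
                                  uint32_t digest[SHA1_DIGEST_NWORDS])
{
	SHA1_HASH_CTX_MGR mgr;
	SHA1_HASH_CTX ctx;
	SHA1_HASH_CTX *done;
	int i;

	sha1_ctx_mgr_init_asimd(&mgr);
	hash_ctx_init(&ctx);	/* assumed initializer from multi_buffer.h */

	done = sha1_ctx_mgr_submit_asimd(&mgr, &ctx, buf, len, HASH_ENTIRE);
	while (done == NULL)
		done = sha1_ctx_mgr_flush_asimd(&mgr);

	for (i = 0; i < SHA1_DIGEST_NWORDS; i++)
		digest[i] = done->job.result_digest[i];
}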
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_ce.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_ce.c
new file mode 100644
index 000000000..e40a344ff
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_ce.c
@@ -0,0 +1,250 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+void sha1_mb_mgr_init_ce(SHA1_MB_JOB_MGR * state);
+SHA1_JOB *sha1_mb_mgr_submit_ce(SHA1_MB_JOB_MGR * state, SHA1_JOB * job);
+SHA1_JOB *sha1_mb_mgr_flush_ce(SHA1_MB_JOB_MGR * state);
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_ce(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_ce(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_ce(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_fixedlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_ce(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_ce(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_ce(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_ce(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_ce(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
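+	// e.g. assuming the usual 8-byte length field: when (total_len % 64) <= 55
+	// the 0x80 byte and the length fit in the first extra block (i becomes 64,
+	// one block to hash); otherwise i becomes 128 and two extra blocks are hashed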
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_ce_slver_02020142;
+struct slver sha1_ctx_mgr_init_ce_slver = { 0x0142, 0x02, 0x02 };
+
+struct slver sha1_ctx_mgr_submit_ce_slver_02020143;
+struct slver sha1_ctx_mgr_submit_ce_slver = { 0x0143, 0x02, 0x02 };
+
+struct slver sha1_ctx_mgr_flush_ce_slver_02020144;
+struct slver sha1_ctx_mgr_flush_ce_slver = { 0x0144, 0x02, 0x02 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c
new file mode 100644
index 000000000..0942c1a95
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c
@@ -0,0 +1,93 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
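+/*
+ * Dispatch policy: prefer the SHA-1 Crypto Extension implementation when
+ * HWCAP_SHA1 is present, fall back to the multi-lane ASIMD implementation on
+ * cores known to benefit from it, and otherwise use the generic base
+ * implementation.
+ */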
+DEFINE_INTERFACE_DISPATCHER(sha1_ctx_mgr_submit)
+{
+
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA1)
+ return PROVIDER_INFO(sha1_ctx_mgr_submit_ce);
+
+ if (auxval & HWCAP_ASIMD) {
+ switch (get_micro_arch_id()) {
+ case MICRO_ARCH_ID(ARM, NEOVERSE_N1): // fall through
+ case MICRO_ARCH_ID(ARM, CORTEX_A57): // fall through
+ case MICRO_ARCH_ID(ARM, CORTEX_A72): // fall through
+ return PROVIDER_INFO(sha1_ctx_mgr_submit_asimd);
+ default:
+ break;
+ }
+ }
+
+ return PROVIDER_BASIC(sha1_ctx_mgr_submit);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(sha1_ctx_mgr_init)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA1)
+ return PROVIDER_INFO(sha1_ctx_mgr_init_ce);
+
+ if (auxval & HWCAP_ASIMD) {
+ switch (get_micro_arch_id()) {
+ case MICRO_ARCH_ID(ARM, NEOVERSE_N1): // fall through
+ case MICRO_ARCH_ID(ARM, CORTEX_A57): // fall through
+ case MICRO_ARCH_ID(ARM, CORTEX_A72): // fall through
+ return PROVIDER_INFO(sha1_ctx_mgr_init_asimd);
+ default:
+ break;
+ }
+ }
+
+ return PROVIDER_BASIC(sha1_ctx_mgr_init);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(sha1_ctx_mgr_flush)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA1)
+ return PROVIDER_INFO(sha1_ctx_mgr_flush_ce);
+
+ if (auxval & HWCAP_ASIMD) {
+ switch (get_micro_arch_id()) {
+ case MICRO_ARCH_ID(ARM, NEOVERSE_N1): // fall through
+ case MICRO_ARCH_ID(ARM, CORTEX_A57): // fall through
+ case MICRO_ARCH_ID(ARM, CORTEX_A72): // fall through
+ return PROVIDER_INFO(sha1_ctx_mgr_flush_asimd);
+ default:
+ break;
+ }
+ }
+
+ return PROVIDER_BASIC(sha1_ctx_mgr_flush);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_asimd_x4.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_asimd_x4.S
new file mode 100644
index 000000000..012b15c14
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_asimd_x4.S
@@ -0,0 +1,192 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .arch armv8-a
+
+#include "sha1_asimd_common.S"
+
+.macro internal_load windex
+	// load 64 bytes from each lane's data pointer to make full use of the cache line
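+	// the st4 stores below transpose the input so that each WORDn vector
+	// ends up holding message word n from all four lanes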
+ .if \windex == 0
+ mov tmp,dataptr
+ ld1 {WORD0.4s},[data0],16
+ ld1 {WORD4.4s},[data0],16
+ ld1 {WORD8.4s},[data0],16
+ ld1 {WORD12.4s},[data0],16
+
+ ld1 {WORD1.4s},[data1],16
+ ld1 {WORD5.4s},[data1],16
+ ld1 {WORD9.4s},[data1],16
+ ld1 {WORD13.4s},[data1],16
+
+ ld1 {WORD2.4s},[data2],16
+ ld1 {WORD6.4s},[data2],16
+ ld1 {WORD10.4s},[data2],16
+ ld1 {WORD14.4s},[data2],16
+
+ ld1 {WORD3.4s},[data3],16
+ ld1 {WORD7.4s},[data3],16
+ ld1 {WORD11.4s},[data3],16
+ ld1 {WORD15.4s},[data3],16
+
+ st4 {WORD0.s,WORD1.s,WORD2.s,WORD3.s}[0],[tmp],16
+ st4 {WORD0.s,WORD1.s,WORD2.s,WORD3.s}[1],[tmp],16
+ st4 {WORD0.s,WORD1.s,WORD2.s,WORD3.s}[2],[tmp],16
+ st4 {WORD0.s,WORD1.s,WORD2.s,WORD3.s}[3],[tmp],16
+ .endif
+
+ .if \windex == 4
+ mov tmp,dataptr
+ st4 {WORD4.s,WORD5.s,WORD6.s,WORD7.s}[0],[tmp],16
+ st4 {WORD4.s,WORD5.s,WORD6.s,WORD7.s}[1],[tmp],16
+ st4 {WORD4.s,WORD5.s,WORD6.s,WORD7.s}[2],[tmp],16
+ st4 {WORD4.s,WORD5.s,WORD6.s,WORD7.s}[3],[tmp],16
+ .endif
+
+ .if \windex == 8
+ mov tmp,dataptr
+ st4 {WORD8.s,WORD9.s,WORD10.s,WORD11.s}[0],[tmp],16
+ st4 {WORD8.s,WORD9.s,WORD10.s,WORD11.s}[1],[tmp],16
+ st4 {WORD8.s,WORD9.s,WORD10.s,WORD11.s}[2],[tmp],16
+ st4 {WORD8.s,WORD9.s,WORD10.s,WORD11.s}[3],[tmp],16
+ .endif
+
+ .if \windex == 12
+ mov tmp,dataptr
+ st4 {WORD12.s,WORD13.s,WORD14.s,WORD15.s}[0],[tmp],16
+ st4 {WORD12.s,WORD13.s,WORD14.s,WORD15.s}[1],[tmp],16
+ st4 {WORD12.s,WORD13.s,WORD14.s,WORD15.s}[2],[tmp],16
+ st4 {WORD12.s,WORD13.s,WORD14.s,WORD15.s}[3],[tmp],16
+ .endif
+.endm
+
+.macro load_x4_word idx:req
+ internal_load \idx
+ ld1 {WORD\idx\().16b},[dataptr],16
+.endm
+
+/*
+ * void sha1_mb_asimd_x4(SHA1_JOB *j0, SHA1_JOB*j1, SHA1_JOB*j2, SHA1_JOB *j3, int blocks)
+ */
+ job0 .req x0
+ job1 .req x1
+ job2 .req x2
+ job3 .req x3
+ num_blocks .req w4
+ tmp .req x5
+ data0 .req x6
+ data1 .req x7
+ data2 .req x8
+ data3 .req x9
+ databuf .req x10
+ dataptr .req x11
+ savedsp .req x12
+
+ .global sha1_mb_asimd_x4
+ .type sha1_mb_asimd_x4, %function
+sha1_mb_asimd_x4:
+ cmp num_blocks, #0
+ beq .return
+ sha1_asimd_save_stack
+ mov savedsp,sp
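+	// carve out a 256-byte scratch buffer below sp, aligned down to 64 bytes,
+	// to hold the transposed message words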
+ sub databuf,sp,256
+ mov tmp,63
+ bic databuf,databuf,tmp
+ mov sp,databuf
+
+ add tmp,job0,64
+ ld4 {VA.s,VB.s,VC.s,VD.s}[0],[tmp],#16
+ ld1 {VE.s}[0],[tmp]
+ ldr data0,[job0]
+
+ add tmp,job1,64
+ ld4 {VA.s,VB.s,VC.s,VD.s}[1],[tmp],#16
+ ld1 {VE.s}[1],[tmp]
+ ldr data1,[job1]
+
+ add tmp,job2,64
+ ld4 {VA.s,VB.s,VC.s,VD.s}[2],[tmp],#16
+ ld1 {VE.s}[2],[tmp]
+ ldr data2,[job2]
+
+ add tmp,job3,64
+ ld4 {VA.s,VB.s,VC.s,VD.s}[3],[tmp],#16
+ ld1 {VE.s}[3],[tmp]
+ ldr data3,[job3]
+
+.block_loop:
+ mov dataptr,databuf
+ sha1_single
+ subs num_blocks, num_blocks, 1
+ bne .block_loop
+
+ add tmp,job0,64
+ st4 {VA.s,VB.s,VC.s,VD.s}[0],[tmp],#16
+ st1 {VE.s}[0],[tmp]
+
+ add tmp,job1,64
+ st4 {VA.s,VB.s,VC.s,VD.s}[1],[tmp],#16
+ st1 {VE.s}[1],[tmp]
+
+ add tmp,job2,64
+ st4 {VA.s,VB.s,VC.s,VD.s}[2],[tmp],#16
+ st1 {VE.s}[2],[tmp]
+
+ add tmp,job3,64
+ st4 {VA.s,VB.s,VC.s,VD.s}[3],[tmp],#16
+ st1 {VE.s}[3],[tmp]
+
+ mov sp,savedsp
+ sha1_asimd_restore_stack
+.return:
+ ret
+
+ .size sha1_mb_asimd_x4, .-sha1_mb_asimd_x4
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 16
+KEY_0:
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+KEY_1:
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+KEY_2:
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+KEY_3:
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_asimd.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_asimd.c
new file mode 100644
index 000000000..4b34e7b53
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_asimd.c
@@ -0,0 +1,217 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stddef.h>
+#include <sha1_mb.h>
+#include <assert.h>
+#include "endian_helper.h"
+
+extern void sha1_aarch64_x1(const uint8_t * data, int num_blocks, uint32_t digest[]);
+static inline void sha1_job_x1(SHA1_JOB * job, int blocks)
+{
+ sha1_aarch64_x1(job->buffer, blocks, job->result_digest);
+}
+
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#define SHA1_MB_ASIMD_MAX_LANES 4
+void sha1_mb_asimd_x4(SHA1_JOB *, SHA1_JOB *, SHA1_JOB *, SHA1_JOB *, int);
+
+#define LANE_IS_NOT_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FREE(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL)
+#define LANE_IS_INVALID(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL)
+
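+/*
+ * lens[i] packs a lane's state as (remaining length in blocks << 4) | lane
+ * index: the macros above test the upper bits to see whether a lane still has
+ * data, and sha1_mb_mgr_do_jobs() recovers the lane index of the shortest job
+ * from the low nibble of the minimum value.
+ */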
+void sha1_mb_mgr_init_asimd(SHA1_MB_JOB_MGR * state)
+{
+ unsigned int i;
+
+ state->unused_lanes = 0xf;
+ state->num_lanes_inuse = 0;
+ for (i = 0; i < SHA1_MB_ASIMD_MAX_LANES; i++) {
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= SHA1_MB_ASIMD_MAX_LANES - 1 - i;
+ state->lens[i] = i;
+ state->ldata[i].job_in_lane = 0;
+ }
+
+	// lanes beyond SHA1_MB_ASIMD_MAX_LANES are marked as invalid
+ for (; i < SHA1_MAX_LANES; i++) {
+ state->lens[i] = 0xf;
+ state->ldata[i].job_in_lane = 0;
+ }
+}
+
+static int sha1_mb_mgr_do_jobs(SHA1_MB_JOB_MGR * state)
+{
+ int lane_idx, len, i, lanes, blocks;
+ int lane_idx_array[SHA1_MAX_LANES];
+
+ if (state->num_lanes_inuse == 0) {
+ return -1;
+ }
+ lanes = 0, len = 0;
+ for (i = 0; i < SHA1_MAX_LANES && lanes < state->num_lanes_inuse; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ if (lanes)
+ len = min(len, state->lens[i]);
+ else
+ len = state->lens[i];
+ lane_idx_array[lanes] = i;
+ lanes++;
+ }
+ }
+
+ if (lanes == 0)
+ return -1;
+ lane_idx = len & 0xf;
+ len = len & (~0xf);
+ blocks = len >> 4;
+
+	/* With fewer than 3 active lanes, ASIMD has little advantage over the
+	 * scalar path because at least 50% of its capacity is wasted, so the
+	 * ASIMD kernel is only used when 3 or 4 lanes carry data.
+	 */
+ if (lanes == SHA1_MB_ASIMD_MAX_LANES) {
+ sha1_mb_asimd_x4(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane,
+ state->ldata[lane_idx_array[2]].job_in_lane,
+ state->ldata[lane_idx_array[3]].job_in_lane, blocks);
+ } else if (lanes == 3) {
+		/* With 3 lanes the ASIMD kernel still processes four lanes of data
+		 * (wasting 25% of its capacity). It could be told the real lane
+		 * count to save some memory loads, but in practice passing lane 0
+		 * again as a dummy gives similar cache behaviour.
+		 */
+ SHA1_JOB dummy;
+ dummy.buffer = state->ldata[lane_idx_array[0]].job_in_lane->buffer;
+ dummy.len = state->ldata[lane_idx_array[0]].job_in_lane->len;
+ sha1_mb_asimd_x4(state->ldata[lane_idx_array[0]].job_in_lane,
+ &dummy,
+ state->ldata[lane_idx_array[1]].job_in_lane,
+ state->ldata[lane_idx_array[2]].job_in_lane, blocks);
+ } else {
+ sha1_job_x1(state->ldata[lane_idx_array[0]].job_in_lane, blocks);
+ if (lanes >= 2) {
+ sha1_job_x1(state->ldata[lane_idx_array[1]].job_in_lane, blocks);
+ }
+ }
+
+	// only the minimum-length job can finish; advance the remaining lanes by the processed length
+ for (i = 0; i < SHA1_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 2;
+ }
+ }
+ return lane_idx;
+
+}
+
+static SHA1_JOB *sha1_mb_mgr_free_lane(SHA1_MB_JOB_MGR * state)
+{
+ int i;
+ SHA1_JOB *ret = NULL;
+
+ for (i = 0; i < SHA1_MB_ASIMD_MAX_LANES; i++) {
+ if (LANE_IS_FINISHED(state, i)) {
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->num_lanes_inuse--;
+ ret = state->ldata[i].job_in_lane;
+ ret->status = STS_COMPLETED;
+ state->ldata[i].job_in_lane = NULL;
+ break;
+ }
+ }
+ return ret;
+}
+
+static void sha1_mb_mgr_insert_job(SHA1_MB_JOB_MGR * state, SHA1_JOB * job)
+{
+ int lane_idx;
+ // add job into lanes
+ lane_idx = state->unused_lanes & 0xf;
+	// no free lane here would be a fatal error
+ assert(lane_idx < SHA1_MB_ASIMD_MAX_LANES);
+ state->lens[lane_idx] = (job->len << 4) | lane_idx;
+ state->ldata[lane_idx].job_in_lane = job;
+ state->unused_lanes >>= 4;
+ state->num_lanes_inuse++;
+}
+
+SHA1_JOB *sha1_mb_mgr_submit_asimd(SHA1_MB_JOB_MGR * state, SHA1_JOB * job)
+{
+#ifndef NDEBUG
+ int lane_idx;
+#endif
+ SHA1_JOB *ret;
+
+ // add job into lanes
+ sha1_mb_mgr_insert_job(state, job);
+
+ ret = sha1_mb_mgr_free_lane(state);
+ if (ret != NULL) {
+ return ret;
+ }
+	// submit waits until every lane has data before starting processing
+ if (state->num_lanes_inuse < SHA1_MB_ASIMD_MAX_LANES)
+ return NULL;
+#ifndef NDEBUG
+ lane_idx = sha1_mb_mgr_do_jobs(state);
+ assert(lane_idx != -1);
+#else
+ sha1_mb_mgr_do_jobs(state);
+#endif
+
+ // ~ i = lane_idx;
+ ret = sha1_mb_mgr_free_lane(state);
+ return ret;
+}
+
+SHA1_JOB *sha1_mb_mgr_flush_asimd(SHA1_MB_JOB_MGR * state)
+{
+ SHA1_JOB *ret;
+ ret = sha1_mb_mgr_free_lane(state);
+ if (ret) {
+ return ret;
+ }
+
+ sha1_mb_mgr_do_jobs(state);
+ return sha1_mb_mgr_free_lane(state);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_ce.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_ce.c
new file mode 100644
index 000000000..1dfd67d0c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_ce.c
@@ -0,0 +1,208 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stddef.h>
+#include <sha1_mb.h>
+#include <assert.h>
+
+#ifndef max
+#define max(a,b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#define SHA1_MB_CE_MAX_LANES 2
+#if SHA1_MB_CE_MAX_LANES >=2
+void sha1_mb_ce_x2(SHA1_JOB *, SHA1_JOB *, int);
+#endif
+void sha1_mb_ce_x1(SHA1_JOB *, int);
+
+#define LANE_IS_NOT_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FREE(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL)
+#define LANE_IS_INVALID(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL)
+void sha1_mb_mgr_init_ce(SHA1_MB_JOB_MGR * state)
+{
+ unsigned int i;
+
+ state->unused_lanes = 0xf;
+ state->num_lanes_inuse = 0;
+ for (i = 0; i < SHA1_MB_CE_MAX_LANES; i++) {
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->lens[i] = i;
+ state->ldata[i].job_in_lane = 0;
+ }
+
+	// lanes beyond SHA1_MB_CE_MAX_LANES are marked as invalid
+ for (; i < SHA1_MAX_LANES; i++) {
+ state->lens[i] = 0xf;
+ state->ldata[i].job_in_lane = 0;
+ }
+}
+
+static int sha1_mb_mgr_do_jobs(SHA1_MB_JOB_MGR * state)
+{
+ int lane_idx, len, i, lanes;
+
+ int lane_idx_array[SHA1_MAX_LANES];
+
+ if (state->num_lanes_inuse == 0) {
+ return -1;
+ }
+#if SHA1_MB_CE_MAX_LANES == 2
+ if (state->num_lanes_inuse == 2) {
+ len = min(state->lens[0], state->lens[1]);
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+
+ sha1_mb_ce_x2(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane, len >> 4);
+
+ } else
+#endif
+ {
+ lanes = 0, len = 0;
+ for (i = 0; i < SHA1_MAX_LANES && lanes < state->num_lanes_inuse; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ if (lanes)
+ len = min(len, state->lens[i]);
+ else
+ len = state->lens[i];
+ lane_idx_array[lanes] = i;
+ lanes++;
+ }
+ }
+ if (lanes == 0)
+ return -1;
+ lane_idx = len & 0xf;
+ len = len & (~0xf);
+
+#if SHA1_MB_CE_MAX_LANES >=2
+ if (lanes == 2) {
+ sha1_mb_ce_x2(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane, len >> 4);
+ } else
+#endif
+ {
+ sha1_mb_ce_x1(state->ldata[lane_idx_array[0]].job_in_lane, len >> 4);
+ }
+ }
+	// only the minimum-length job can finish; advance the remaining lanes by the processed length
+ for (i = 0; i < SHA1_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 2;
+ }
+ }
+
+ return lane_idx;
+
+}
+
+static SHA1_JOB *sha1_mb_mgr_free_lane(SHA1_MB_JOB_MGR * state)
+{
+ int i;
+ SHA1_JOB *ret = NULL;
+
+ for (i = 0; i < SHA1_MB_CE_MAX_LANES; i++) {
+ if (LANE_IS_FINISHED(state, i)) {
+
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->num_lanes_inuse--;
+ ret = state->ldata[i].job_in_lane;
+ ret->status = STS_COMPLETED;
+ state->ldata[i].job_in_lane = NULL;
+ break;
+ }
+ }
+ return ret;
+}
+
+static void sha1_mb_mgr_insert_job(SHA1_MB_JOB_MGR * state, SHA1_JOB * job)
+{
+ int lane_idx;
+ //add job into lanes
+ lane_idx = state->unused_lanes & 0xf;
+	// no free lane here would be a fatal error
+ assert(lane_idx < SHA1_MB_CE_MAX_LANES);
+ state->lens[lane_idx] = (job->len << 4) | lane_idx;
+ state->ldata[lane_idx].job_in_lane = job;
+ state->unused_lanes >>= 4;
+ state->num_lanes_inuse++;
+}
+
+SHA1_JOB *sha1_mb_mgr_submit_ce(SHA1_MB_JOB_MGR * state, SHA1_JOB * job)
+{
+#ifndef NDEBUG
+ int lane_idx;
+#endif
+ SHA1_JOB *ret;
+
+ //add job into lanes
+ sha1_mb_mgr_insert_job(state, job);
+
+ ret = sha1_mb_mgr_free_lane(state);
+ if (ret != NULL) {
+ return ret;
+ }
+	// submit waits until every lane has data before starting processing
+ if (state->num_lanes_inuse < SHA1_MB_CE_MAX_LANES)
+ return NULL;
+#ifndef NDEBUG
+ lane_idx = sha1_mb_mgr_do_jobs(state);
+ assert(lane_idx != -1);
+#else
+ sha1_mb_mgr_do_jobs(state);
+#endif
+
+ //~ i = lane_idx;
+ ret = sha1_mb_mgr_free_lane(state);
+ return ret;
+}
+
+SHA1_JOB *sha1_mb_mgr_flush_ce(SHA1_MB_JOB_MGR * state)
+{
+ SHA1_JOB *ret;
+ ret = sha1_mb_mgr_free_lane(state);
+ if (ret) {
+ return ret;
+ }
+
+ sha1_mb_mgr_do_jobs(state);
+ return sha1_mb_mgr_free_lane(state);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_multibinary.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_multibinary.S
new file mode 100644
index 000000000..bb1929d76
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_multibinary.S
@@ -0,0 +1,36 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#include "aarch64_multibinary.h"
+
+
+mbin_interface sha1_ctx_mgr_submit
+mbin_interface sha1_ctx_mgr_init
+mbin_interface sha1_ctx_mgr_flush
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x1_ce.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x1_ce.S
new file mode 100644
index 000000000..22f736793
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x1_ce.S
@@ -0,0 +1,194 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+
+/**
+Macro covering rounds 4-67: each invocation performs four SHA-1 rounds with the
+given instruction (sha1c/sha1p/sha1m) and advances the message schedule.
+*/
+.macro sha1_4_rounds inst:req,msg0:req,msg1:req,msg2:req,msg3:req,abcd:req,e0:req,tmp0:req,e1:req,tmp1:req,k:req
+ sha1h \e0\()_s, \abcd\()_s
+ \inst \abcd\()_q,\e1\()_s,\tmp1\()_v.4s
+ add \tmp1\()_v.4s,\msg3\()_v.4s,\k\()_v.4s
+ sha1su1 \msg0\()_v.4s,\msg3\()_v.4s
+ sha1su0 \msg1\()_v.4s,\msg2\()_v.4s,\msg3\()_v.4s
+.endm
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg key_0,28
+ declare_var_vector_reg key_1,29
+ declare_var_vector_reg key_2,30
+ declare_var_vector_reg key_3,31
+
+
+/*
+digest variables
+*/
+ declare_var_vector_reg abcd,0
+ declare_var_vector_reg e0,1
+ declare_var_vector_reg e1,2
+ declare_var_vector_reg abcd_saved,3
+ declare_var_vector_reg e0_saved,4
+/*
+Message variables
+*/
+ declare_var_vector_reg msg_0,16
+ declare_var_vector_reg msg_1,17
+ declare_var_vector_reg msg_2,18
+ declare_var_vector_reg msg_3,19
+/*
+Temporary variables
+*/
+ declare_var_vector_reg tmp_0,5
+ declare_var_vector_reg tmp_1,6
+
+/*
+ void sha1_mb_ce_x1(SHA1_JOB * job, int len);
+*/
+/*
+Argument list
+*/
+ job .req x0
+ len .req w1
+ data .req x2
+ tmp .req x3
+ .global sha1_mb_ce_x1
+ .type sha1_mb_ce_x1, %function
+sha1_mb_ce_x1:
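+	// the job's data pointer is loaded from offset 0; the working digest is
+	// kept as A..D at offset 64 and E at offset 80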
+ ldr data, [job]
+ ldr abcd_q, [job, 64]
+ ldr e0_s, [job, 80]
+ adr tmp, KEY
+ ld1 {key_0_v.4s-key_3_v.4s},[tmp]
+
+start_loop:
+
+ //load msgs
+ ld1 {msg_0_v.4s-msg_3_v.4s},[data]
+
+ //adjust loop parameter
+ add data,data,64
+ sub len, len, #1
+ cmp len, 0
+ //backup digest
+ mov abcd_saved_v.16b,abcd_v.16b
+ mov e0_saved_v.16b,e0_v.16b
+
+ rev32 msg_0_v.16b,msg_0_v.16b
+ rev32 msg_1_v.16b,msg_1_v.16b
+ add tmp_0_v.4s,msg_0_v.4s,key_0_v.4s
+ rev32 msg_2_v.16b,msg_2_v.16b
+ add tmp_1_v.4s,msg_1_v.4s,key_0_v.4s
+ rev32 msg_3_v.16b,msg_3_v.16b
+
+ /* rounds 0-3 */
+ sha1h e1_s,abcd_s
+ sha1c abcd_q,e0_s,tmp_0_v.4s
+ add tmp_0_v.4s,msg_2_v.4s,key_0_v.4s
+ sha1su0 msg_0_v.4s,msg_1_v.4s,msg_2_v.4s
+
+ sha1_4_rounds sha1c,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_0 /* rounds 4-7 */
+ sha1_4_rounds sha1c,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_0
+ sha1_4_rounds sha1c,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_1 /* rounds 12-15 */
+ sha1_4_rounds sha1c,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_1
+ sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_1 /* rounds 20-23 */
+ sha1_4_rounds sha1p,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_1
+ sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_1
+ sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_2
+ sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_2 /* rounds 36-39 */
+ sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_2
+ sha1_4_rounds sha1m,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_2
+ sha1_4_rounds sha1m,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_2
+ sha1_4_rounds sha1m,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_3 /* rounds 52-55 */
+ sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_3
+ sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_3
+ sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_3
+
+ /* rounds 68-71 */
+ sha1h e0_s,abcd_s
+ sha1p abcd_q,e1_s,tmp_1_v.4s
+ add tmp_1_v.4s,msg_3_v.4s,key_3_v.4s
+ sha1su1 msg_0_v.4s,msg_3_v.4s
+
+ /* rounds 72-75 */
+ sha1h e1_s,abcd_s
+ sha1p abcd_q,e0_s,tmp_0_v.4s
+
+ /* rounds 76-79 */
+ sha1h e0_s,abcd_s
+ sha1p abcd_q,e1_s,tmp_1_v.4s
+
+
+
+ add abcd_v.4s,abcd_v.4s,abcd_saved_v.4s
+ add e0_v.2s,e0_v.2s,e0_saved_v.2s
+
+
+ bgt start_loop
+ str abcd_q, [job, 64]
+ str e0_s, [job, 80]
+
+ ret
+
+ .size sha1_mb_ce_x1, .-sha1_mb_ce_x1
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 4
+KEY:
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x2_ce.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x2_ce.S
new file mode 100644
index 000000000..93f653ad2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x2_ce.S
@@ -0,0 +1,253 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+
+/**
+Macro covering rounds 4-67 for both lanes: each invocation performs four SHA-1
+rounds with the given instruction on the l0_ and l1_ register sets in parallel.
+*/
+.macro sha1_4_rounds inst:req,msg0:req,msg1:req,msg2:req,msg3:req,abcd:req,e0:req,tmp0:req,e1:req,tmp1:req,k:req
+ sha1h l0_\e0\()_s, l0_\abcd\()_s
+ sha1h l1_\e0\()_s, l1_\abcd\()_s
+
+ \inst l0_\abcd\()_q,l0_\e1\()_s,l0_\tmp1\()_v.4s
+ \inst l1_\abcd\()_q,l1_\e1\()_s,l1_\tmp1\()_v.4s
+
+ add l0_\tmp1\()_v.4s,l0_\msg3\()_v.4s,\k\()_v.4s
+ add l1_\tmp1\()_v.4s,l1_\msg3\()_v.4s,\k\()_v.4s
+
+ sha1su1 l0_\msg0\()_v.4s,l0_\msg3\()_v.4s
+ sha1su1 l1_\msg0\()_v.4s,l1_\msg3\()_v.4s
+
+ sha1su0 l0_\msg1\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
+ sha1su0 l1_\msg1\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s
+.endm
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg key_0,28
+ declare_var_vector_reg key_1,29
+ declare_var_vector_reg key_2,30
+ declare_var_vector_reg key_3,31
+
+
+/*
+lane variables
+*/
+ declare_var_vector_reg l0_abcd,0
+ declare_var_vector_reg l0_e0,1
+ declare_var_vector_reg l0_e1,2
+ declare_var_vector_reg l0_abcd_saved,3
+ declare_var_vector_reg l0_e0_saved,4
+ declare_var_vector_reg l0_tmp_0,5
+ declare_var_vector_reg l0_tmp_1,6
+ declare_var_vector_reg l0_msg_0,16
+ declare_var_vector_reg l0_msg_1,17
+ declare_var_vector_reg l0_msg_2,18
+ declare_var_vector_reg l0_msg_3,19
+
+ declare_var_vector_reg l1_abcd,7
+ declare_var_vector_reg l1_e0,8
+ declare_var_vector_reg l1_e1,9
+ declare_var_vector_reg l1_abcd_saved,24
+ declare_var_vector_reg l1_e0_saved,25
+ declare_var_vector_reg l1_tmp_0,26
+ declare_var_vector_reg l1_tmp_1,27
+ declare_var_vector_reg l1_msg_0,20
+ declare_var_vector_reg l1_msg_1,21
+ declare_var_vector_reg l1_msg_2,22
+ declare_var_vector_reg l1_msg_3,23
+
+/*
+ void sha1_mb_ce_x2(SHA1_JOB * job_0, SHA1_JOB * job_1,int len);
+*/
+ l0_job .req x0
+ l1_job .req x1
+ len .req w2
+
+ l0_data .req x3
+ l1_data .req x4
+ tmp .req x5
+ .global sha1_mb_ce_x2
+ .type sha1_mb_ce_x2, %function
+sha1_mb_ce_x2:
+ //push d8,d9 to stack
+ stp d8, d9, [sp, -256]!
+
+ adr tmp, KEY
+ ld1 {key_0_v.4s-key_3_v.4s},[tmp]
+ ldr l0_data, [l0_job]
+ ldr l1_data, [l1_job]
+ ldr l0_abcd_q, [l0_job, 64]
+ ldr l0_e0_s, [l0_job, 80]
+ ldr l1_abcd_q, [l1_job, 64]
+ ldr l1_e0_s, [l1_job, 80]
+
+start_loop:
+
+ //load msgs
+ ld1 {l0_msg_0_v.4s-l0_msg_3_v.4s},[l0_data]
+ ld1 {l1_msg_0_v.4s-l1_msg_3_v.4s},[l1_data]
+
+ //adjust loop parameter
+ add l0_data,l0_data,64
+ add l1_data,l1_data,64
+ sub len, len, #1
+ cmp len, 0
+ //backup digest
+ mov l0_abcd_saved_v.16b, l0_abcd_v.16b
+ mov l0_e0_saved_v.16b, l0_e0_v.16b
+ mov l1_abcd_saved_v.16b, l1_abcd_v.16b
+ mov l1_e0_saved_v.16b, l1_e0_v.16b
+
+ rev32 l0_msg_0_v.16b, l0_msg_0_v.16b
+ rev32 l0_msg_1_v.16b, l0_msg_1_v.16b
+ add l0_tmp_0_v.4s, l0_msg_0_v.4s, key_0_v.4s
+ rev32 l0_msg_2_v.16b, l0_msg_2_v.16b
+ add l0_tmp_1_v.4s, l0_msg_1_v.4s, key_0_v.4s
+ rev32 l0_msg_3_v.16b, l0_msg_3_v.16b
+
+ rev32 l1_msg_0_v.16b, l1_msg_0_v.16b
+ rev32 l1_msg_1_v.16b, l1_msg_1_v.16b
+ add l1_tmp_0_v.4s, l1_msg_0_v.4s, key_0_v.4s
+ rev32 l1_msg_2_v.16b, l1_msg_2_v.16b
+ add l1_tmp_1_v.4s, l1_msg_1_v.4s, key_0_v.4s
+ rev32 l1_msg_3_v.16b, l1_msg_3_v.16b
+
+ /* rounds 0-3 */
+ sha1h l0_e1_s, l0_abcd_s
+ sha1c l0_abcd_q, l0_e0_s, l0_tmp_0_v.4s
+ add l0_tmp_0_v.4s, l0_msg_2_v.4s, key_0_v.4s
+ sha1su0 l0_msg_0_v.4s, l0_msg_1_v.4s, l0_msg_2_v.4s
+
+ sha1h l1_e1_s, l1_abcd_s
+ sha1c l1_abcd_q, l1_e0_s, l1_tmp_0_v.4s
+ add l1_tmp_0_v.4s, l1_msg_2_v.4s, key_0_v.4s
+ sha1su0 l1_msg_0_v.4s, l1_msg_1_v.4s, l1_msg_2_v.4s
+
+ sha1_4_rounds sha1c,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_0 /* rounds 4-7 */
+ sha1_4_rounds sha1c,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_0
+ sha1_4_rounds sha1c,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_1 /* rounds 12-15 */
+ sha1_4_rounds sha1c,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_1
+ sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_1 /* rounds 20-23 */
+ sha1_4_rounds sha1p,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_1
+ sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_1
+ sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_2
+ sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_2 /* rounds 36-39 */
+ sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_2
+ sha1_4_rounds sha1m,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_2
+ sha1_4_rounds sha1m,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_2
+ sha1_4_rounds sha1m,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_3 /* rounds 52-55 */
+ sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_3
+ sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_3
+ sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_3
+
+ /* rounds 68-71 */
+ sha1h l0_e0_s, l0_abcd_s
+ sha1p l0_abcd_q, l0_e1_s, l0_tmp_1_v.4s
+ add l0_tmp_1_v.4s, l0_msg_3_v.4s, key_3_v.4s
+ sha1su1 l0_msg_0_v.4s, l0_msg_3_v.4s
+
+ sha1h l1_e0_s, l1_abcd_s
+ sha1p l1_abcd_q, l1_e1_s, l1_tmp_1_v.4s
+ add l1_tmp_1_v.4s, l1_msg_3_v.4s, key_3_v.4s
+ sha1su1 l1_msg_0_v.4s, l1_msg_3_v.4s
+
+ /* rounds 72-75 */
+ sha1h l0_e1_s, l0_abcd_s
+ sha1p l0_abcd_q, l0_e0_s, l0_tmp_0_v.4s
+
+ sha1h l1_e1_s, l1_abcd_s
+ sha1p l1_abcd_q, l1_e0_s, l1_tmp_0_v.4s
+
+ /* rounds 76-79 */
+ sha1h l0_e0_s, l0_abcd_s
+ sha1p l0_abcd_q, l0_e1_s, l0_tmp_1_v.4s
+
+ sha1h l1_e0_s, l1_abcd_s
+ sha1p l1_abcd_q, l1_e1_s, l1_tmp_1_v.4s
+
+
+
+ add l0_abcd_v.4s, l0_abcd_v.4s, l0_abcd_saved_v.4s
+ add l0_e0_v.2s, l0_e0_v.2s, l0_e0_saved_v.2s
+ add l1_abcd_v.4s, l1_abcd_v.4s, l1_abcd_saved_v.4s
+ add l1_e0_v.2s, l1_e0_v.2s, l1_e0_saved_v.2s
+
+
+
+
+ bgt start_loop
+
+ str l0_abcd_q, [l0_job, 64]
+ str l0_e0_s, [l0_job, 80]
+
+
+ str l1_abcd_q, [l1_job, 64]
+ str l1_e0_s, [l1_job, 80]
+
+ //pop d8,d9 from stack
+ ldp d8, d9, [sp], 256
+ ret
+
+ .size sha1_mb_ce_x2, .-sha1_mb_ce_x2
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 4
+KEY:
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c
new file mode 100644
index 000000000..ad91d64ac
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c
@@ -0,0 +1,265 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx")
+#endif
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_avx(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_avx(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_avx(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_fixedlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_avx(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_avx(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_avx_slver_02020142;
+struct slver sha1_ctx_mgr_init_avx_slver = { 0x0142, 0x02, 0x02 };
+
+struct slver sha1_ctx_mgr_submit_avx_slver_02020143;
+struct slver sha1_ctx_mgr_submit_avx_slver = { 0x0143, 0x02, 0x02 };
+
+struct slver sha1_ctx_mgr_flush_avx_slver_02020144;
+struct slver sha1_ctx_mgr_flush_avx_slver = { 0x0144, 0x02, 0x02 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c
new file mode 100644
index 000000000..85977d4c2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c
@@ -0,0 +1,264 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_avx2(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_avx2(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_avx2(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_avx2(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_avx2(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
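+
+/*
+ * Illustrative usage only (not part of the upstream file): a minimal sketch
+ * of how a caller might drive this context manager.  It assumes the
+ * hash_ctx_init() helper from multi_buffer.h and a suitably aligned manager
+ * object; error checking is omitted.
+ *
+ *	SHA1_HASH_CTX_MGR mgr;
+ *	SHA1_HASH_CTX ctx;
+ *
+ *	sha1_ctx_mgr_init_avx2(&mgr);
+ *	hash_ctx_init(&ctx);
+ *
+ *	// Hash a whole buffer in one call (FIRST and LAST combined).
+ *	sha1_ctx_mgr_submit_avx2(&mgr, &ctx, buf, buf_len, HASH_ENTIRE);
+ *
+ *	// Drain jobs still in flight before reading ctx.job.result_digest[].
+ *	while (sha1_ctx_mgr_flush_avx2(&mgr) != NULL)
+ *		;
+ */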
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
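+
+/*
+ * Worked example (illustrative, not part of the upstream file), assuming
+ * SHA1_BLOCK_SIZE == 64 and SHA1_PADLENGTHFIELD_SIZE == 8:
+ *   total_len = 10: 10 data bytes + 0x80 + 8 length bytes fit in one block,
+ *                   i ends up at 64 and hash_pad() returns 1 extra block.
+ *   total_len = 60: 60 + 1 + 8 > 64, so the length field spills into the
+ *                   next block, i ends up at 128 and hash_pad() returns 2.
+ */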
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_avx2_slver_04020145;
+struct slver sha1_ctx_mgr_init_avx2_slver = { 0x0145, 0x02, 0x04 };
+
+struct slver sha1_ctx_mgr_submit_avx2_slver_04020146;
+struct slver sha1_ctx_mgr_submit_avx2_slver = { 0x0146, 0x02, 0x04 };
+
+struct slver sha1_ctx_mgr_flush_avx2_slver_04020147;
+struct slver sha1_ctx_mgr_flush_avx2_slver = { 0x0147, 0x02, 0x04 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c
new file mode 100644
index 000000000..90e087163
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c
@@ -0,0 +1,271 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_avx512(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_avx512(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx =
+ (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_avx512(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_avx512(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx =
+ (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_avx512_slver_0600014a;
+struct slver sha1_ctx_mgr_init_avx512_slver = { 0x014a, 0x00, 0x06 };
+
+struct slver sha1_ctx_mgr_submit_avx512_slver_0600014b;
+struct slver sha1_ctx_mgr_submit_avx512_slver = { 0x014b, 0x00, 0x06 };
+
+struct slver sha1_ctx_mgr_flush_avx512_slver_0600014c;
+struct slver sha1_ctx_mgr_flush_avx512_slver = { 0x014c, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512_ni.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512_ni.c
new file mode 100644
index 000000000..2013f829a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512_ni.c
@@ -0,0 +1,281 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+/**
+ * The sha1_ctx_avx512_ni functions target Cannon Lake (AVX512 plus SHA-NI).
+ * Since SHA-NI is still slower than the multi-buffer code when all lanes are
+ * full, sha1_ctx_mgr_init_avx512_ni and sha1_ctx_mgr_submit_avx512_ni are
+ * essentially the same as their avx512 counterparts.
+ * sha1_ctx_mgr_flush_avx512_ni is different: it calls
+ * sha1_mb_mgr_flush_avx512_ni, which switches to SHA-NI when the number of
+ * active lanes drops to or below a threshold.
+ *
+ */
+#if defined(HAVE_AS_KNOWS_AVX512) && defined(HAVE_AS_KNOWS_SHANI)
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_avx512_ni(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_avx512_ni(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx =
+ (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_avx512_ni(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_avx512_ni(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx =
+ (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_avx512_ni_slver_080002c4;
+struct slver sha1_ctx_mgr_init_avx512_ni_slver = { 0x02c4, 0x00, 0x08 };
+
+struct slver sha1_ctx_mgr_submit_avx512_ni_slver_080002c5;
+struct slver sha1_ctx_mgr_submit_avx512_ni_slver = { 0x02c5, 0x00, 0x08 };
+
+struct slver sha1_ctx_mgr_flush_avx512_ni_slver_080002c6;
+struct slver sha1_ctx_mgr_flush_avx512_ni_slver = { 0x02c6, 0x00, 0x08 };
+
+#endif // HAVE_AS_KNOWS_AVX512 and HAVE_AS_KNOWS_SHANI
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base.c
new file mode 100644
index 000000000..90481efd0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base.c
@@ -0,0 +1,325 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+#define F1(b,c,d) (d ^ (b & (c ^ d)))
+#define F2(b,c,d) (b ^ c ^ d)
+#define F3(b,c,d) ((b & c) | (d & (b | c)))
+#define F4(b,c,d) (b ^ c ^ d)
+
+#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r))))
+
+#define W(x) w[(x) & 15]
+
+#define step00_19(i,a,b,c,d,e) \
+ if (i>15) W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ else W(i) = to_be32(ww[i]); \
+ e += rol32(a,5) + F1(b,c,d) + 0x5A827999 + W(i); \
+ b = rol32(b,30)
+
+#define step20_39(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F2(b,c,d) + 0x6ED9EBA1 + W(i); \
+ b = rol32(b,30)
+
+#define step40_59(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F3(b,c,d) + 0x8F1BBCDC + W(i); \
+ b = rol32(b,30)
+
+#define step60_79(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F4(b,c,d) + 0xCA62C1D6 + W(i); \
+ b = rol32(b,30)
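+
+// The schedule above keeps only a 16-entry sliding window: W(x) masks its
+// index with 15, and for rounds 16-79 each new word is
+// rol32(W(i-3) ^ W(i-8) ^ W(i-14) ^ W(i-16), 1), as in FIPS 180-4.
+// F1 and F3 are the SHA-1 Ch and Maj functions; F2 and F4 are plain parity.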
+
+static void sha1_init(SHA1_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static uint32_t sha1_update(SHA1_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static void sha1_final(SHA1_HASH_CTX * ctx, uint32_t remain_len);
+static void OPT_FIX sha1_single(const void *data, uint32_t digest[]);
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+
+void sha1_ctx_mgr_init_base(SHA1_HASH_CTX_MGR * mgr)
+{
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_base(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ uint32_t remain_len;
+
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_PROCESSING) && (flags == HASH_ENTIRE)) {
+ // Cannot submit a new entire job to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags == HASH_FIRST) {
+
+ sha1_init(ctx, buffer, len);
+ sha1_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_UPDATE) {
+ sha1_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_LAST) {
+ remain_len = sha1_update(ctx, buffer, len);
+ sha1_final(ctx, remain_len);
+ }
+
+ if (flags == HASH_ENTIRE) {
+ sha1_init(ctx, buffer, len);
+ remain_len = sha1_update(ctx, buffer, len);
+ sha1_final(ctx, remain_len);
+ }
+
+ return ctx;
+}
+
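+// The base implementation hashes synchronously inside sha1_ctx_mgr_submit_base(),
+// so there is never an outstanding job left for flush to return.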
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_base(SHA1_HASH_CTX_MGR * mgr)
+{
+ return NULL;
+}
+
+static void sha1_init(SHA1_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Mark it as processing
+ ctx->status = HASH_CTX_STS_PROCESSING;
+}
+
+static uint32_t sha1_update(SHA1_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ uint32_t remain_len = len;
+ uint32_t *digest = ctx->job.result_digest;
+
+ while (remain_len >= SHA1_BLOCK_SIZE) {
+ sha1_single(buffer, digest);
+ buffer = (void *)((uint8_t *) buffer + SHA1_BLOCK_SIZE);
+ remain_len -= SHA1_BLOCK_SIZE;
+ ctx->total_length += SHA1_BLOCK_SIZE;
+ }
+
+ ctx->status = HASH_CTX_STS_IDLE;
+ ctx->incoming_buffer = buffer;
+ return remain_len;
+}
+
+static void sha1_final(SHA1_HASH_CTX * ctx, uint32_t remain_len)
+{
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t i = remain_len, j;
+ uint8_t buf[2 * SHA1_BLOCK_SIZE];
+ uint32_t *digest = ctx->job.result_digest;
+
+ ctx->total_length += i;
+ memcpy(buf, buffer, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA1_BLOCK_SIZE) - SHA1_PADLENGTHFIELD_SIZE); j++)
+ buf[j] = 0;
+
+ if (i > SHA1_BLOCK_SIZE - SHA1_PADLENGTHFIELD_SIZE)
+ i = 2 * SHA1_BLOCK_SIZE;
+ else
+ i = SHA1_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) ctx->total_length * 8);
+
+ sha1_single(buf, digest);
+ if (i == 2 * SHA1_BLOCK_SIZE) {
+ sha1_single(buf + SHA1_BLOCK_SIZE, digest);
+ }
+
+ ctx->status = HASH_CTX_STS_COMPLETE;
+}
+
+void sha1_single(const void *data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e;
+ uint32_t w[16] = { 0 };
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+
+ step00_19(0, a, b, c, d, e);
+ step00_19(1, e, a, b, c, d);
+ step00_19(2, d, e, a, b, c);
+ step00_19(3, c, d, e, a, b);
+ step00_19(4, b, c, d, e, a);
+ step00_19(5, a, b, c, d, e);
+ step00_19(6, e, a, b, c, d);
+ step00_19(7, d, e, a, b, c);
+ step00_19(8, c, d, e, a, b);
+ step00_19(9, b, c, d, e, a);
+ step00_19(10, a, b, c, d, e);
+ step00_19(11, e, a, b, c, d);
+ step00_19(12, d, e, a, b, c);
+ step00_19(13, c, d, e, a, b);
+ step00_19(14, b, c, d, e, a);
+ step00_19(15, a, b, c, d, e);
+ step00_19(16, e, a, b, c, d);
+ step00_19(17, d, e, a, b, c);
+ step00_19(18, c, d, e, a, b);
+ step00_19(19, b, c, d, e, a);
+
+ step20_39(20, a, b, c, d, e);
+ step20_39(21, e, a, b, c, d);
+ step20_39(22, d, e, a, b, c);
+ step20_39(23, c, d, e, a, b);
+ step20_39(24, b, c, d, e, a);
+ step20_39(25, a, b, c, d, e);
+ step20_39(26, e, a, b, c, d);
+ step20_39(27, d, e, a, b, c);
+ step20_39(28, c, d, e, a, b);
+ step20_39(29, b, c, d, e, a);
+ step20_39(30, a, b, c, d, e);
+ step20_39(31, e, a, b, c, d);
+ step20_39(32, d, e, a, b, c);
+ step20_39(33, c, d, e, a, b);
+ step20_39(34, b, c, d, e, a);
+ step20_39(35, a, b, c, d, e);
+ step20_39(36, e, a, b, c, d);
+ step20_39(37, d, e, a, b, c);
+ step20_39(38, c, d, e, a, b);
+ step20_39(39, b, c, d, e, a);
+
+ step40_59(40, a, b, c, d, e);
+ step40_59(41, e, a, b, c, d);
+ step40_59(42, d, e, a, b, c);
+ step40_59(43, c, d, e, a, b);
+ step40_59(44, b, c, d, e, a);
+ step40_59(45, a, b, c, d, e);
+ step40_59(46, e, a, b, c, d);
+ step40_59(47, d, e, a, b, c);
+ step40_59(48, c, d, e, a, b);
+ step40_59(49, b, c, d, e, a);
+ step40_59(50, a, b, c, d, e);
+ step40_59(51, e, a, b, c, d);
+ step40_59(52, d, e, a, b, c);
+ step40_59(53, c, d, e, a, b);
+ step40_59(54, b, c, d, e, a);
+ step40_59(55, a, b, c, d, e);
+ step40_59(56, e, a, b, c, d);
+ step40_59(57, d, e, a, b, c);
+ step40_59(58, c, d, e, a, b);
+ step40_59(59, b, c, d, e, a);
+
+ step60_79(60, a, b, c, d, e);
+ step60_79(61, e, a, b, c, d);
+ step60_79(62, d, e, a, b, c);
+ step60_79(63, c, d, e, a, b);
+ step60_79(64, b, c, d, e, a);
+ step60_79(65, a, b, c, d, e);
+ step60_79(66, e, a, b, c, d);
+ step60_79(67, d, e, a, b, c);
+ step60_79(68, c, d, e, a, b);
+ step60_79(69, b, c, d, e, a);
+ step60_79(70, a, b, c, d, e);
+ step60_79(71, e, a, b, c, d);
+ step60_79(72, d, e, a, b, c);
+ step60_79(73, c, d, e, a, b);
+ step60_79(74, b, c, d, e, a);
+ step60_79(75, a, b, c, d, e);
+ step60_79(76, e, a, b, c, d);
+ step60_79(77, d, e, a, b, c);
+ step60_79(78, c, d, e, a, b);
+ step60_79(79, b, c, d, e, a);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+struct slver sha1_ctx_mgr_init_base_slver_00000192;
+struct slver sha1_ctx_mgr_init_base_slver = { 0x0192, 0x00, 0x00 };
+
+struct slver sha1_ctx_mgr_submit_base_slver_00000193;
+struct slver sha1_ctx_mgr_submit_base_slver = { 0x0193, 0x00, 0x00 };
+
+struct slver sha1_ctx_mgr_flush_base_slver_00000194;
+struct slver sha1_ctx_mgr_flush_base_slver = { 0x0194, 0x00, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base_aliases.c
new file mode 100644
index 000000000..32eb07f6e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base_aliases.c
@@ -0,0 +1,54 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdint.h>
+#include <string.h>
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+
+extern void sha1_ctx_mgr_init_base(SHA1_HASH_CTX_MGR * mgr);
+extern SHA1_HASH_CTX *sha1_ctx_mgr_submit_base(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags);
+extern SHA1_HASH_CTX *sha1_ctx_mgr_flush_base(SHA1_HASH_CTX_MGR * mgr);
+
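+// These wrappers expose the generic sha1_ctx_mgr_* entry points by forwarding
+// to the portable base implementation declared above.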
+void sha1_ctx_mgr_init(SHA1_HASH_CTX_MGR * mgr)
+{
+ return sha1_ctx_mgr_init_base(mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ return sha1_ctx_mgr_submit_base(mgr, ctx, buffer, len, flags);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush(SHA1_HASH_CTX_MGR * mgr)
+{
+ return sha1_ctx_mgr_flush_base(mgr);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c
new file mode 100644
index 000000000..db70ee015
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c
@@ -0,0 +1,251 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_sse(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_sse(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_sse(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_sse(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_sse(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_sse_slver_00020139;
+struct slver sha1_ctx_mgr_init_sse_slver = { 0x0139, 0x02, 0x00 };
+
+struct slver sha1_ctx_mgr_submit_sse_slver_00020140;
+struct slver sha1_ctx_mgr_submit_sse_slver = { 0x0140, 0x02, 0x00 };
+
+struct slver sha1_ctx_mgr_flush_sse_slver_00020141;
+struct slver sha1_ctx_mgr_flush_sse_slver = { 0x0141, 0x02, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse_ni.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse_ni.c
new file mode 100644
index 000000000..d3c7687d2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse_ni.c
@@ -0,0 +1,259 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_SHANI
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_sse_ni(SHA1_HASH_CTX_MGR * mgr)
+{
+	// Same as the SSE version
+ sha1_mb_mgr_init_sse(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_sse_ni(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx =
+ (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse_ni(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_sse_ni(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_sse_ni(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse_ni(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx =
+ (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse_ni(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_sse_ni_slver_070002c1;
+struct slver sha1_ctx_mgr_init_sse_ni_slver = { 0x02c1, 0x00, 0x07 };
+
+struct slver sha1_ctx_mgr_submit_sse_ni_slver_070002c2;
+struct slver sha1_ctx_mgr_submit_sse_ni_slver = { 0x02c2, 0x00, 0x07 };
+
+struct slver sha1_ctx_mgr_flush_sse_ni_slver_070002c3;
+struct slver sha1_ctx_mgr_flush_sse_ni_slver = { 0x02c3, 0x00, 0x07 };
+
+#endif // HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm
new file mode 100644
index 000000000..1c9a66fd4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm
@@ -0,0 +1,67 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define STS_UNKNOWN 0
+%define STS_BEING_PROCESSED 1
+%define STS_COMPLETED 2
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Threshold constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; if the number of lanes in use is <= threshold, use the single-buffer (sb) function
+%define SHA1_SB_THRESHOLD_SSE 1
+%define SHA1_SB_THRESHOLD_AVX 1
+%define SHA1_SB_THRESHOLD_AVX2 1
+%define SHA1_SB_THRESHOLD_AVX512 1
+%define SHA1_NI_SB_THRESHOLD_SSE	4	; SHA-NI single-buffer is faster than the SSE sha1_mb path at low lane counts
+%define SHA1_NI_SB_THRESHOLD_AVX512 6
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA1_JOB structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SHA1_JOB
+
+;;; name size align
+FIELD _buffer, 8, 8 ; pointer to buffer
+FIELD _len, 4, 4 ; length in bytes
+FIELD _result_digest, 5*4, 64 ; Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+END_FIELDS
+
+%assign _SHA1_JOB_size _FIELD_OFFSET
+%assign _SHA1_JOB_align _STRUCT_ALIGN
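The FIELD macros above describe the assembly-side view of a SHA1_JOB, and the SHA1_*_SB_THRESHOLD values steer the flush routines toward a single-buffer kernel when only a few lanes are occupied. Below is a hedged C mirror of the implied job layout; the authoritative typedef lives in sha1_mb.h, and the struct name and the GCC/Clang alignment attribute here are illustrative assumptions.

#include <stdint.h>

/* Layout implied by the FIELD lines above (offsets: buffer 0, len 8,
 * result_digest 64 because of its 64-byte alignment, status 84, user_data 88).
 * Sketch only; the real definition is the SHA1_JOB typedef in sha1_mb.h. */
typedef struct {
	uint8_t  *buffer;                    /* _buffer: input data pointer                     */
	uint32_t  len;                       /* _len: length handed to the MB core              */
	uint32_t  result_digest[5] __attribute__((aligned(64)));  /* _result_digest             */
	uint32_t  status;                    /* _status: STS_UNKNOWN/BEING_PROCESSED/COMPLETED  */
	void     *user_data;                 /* _user_data: opaque caller data                  */
} sha1_job_sketch;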
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_flush_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_flush_test.c
new file mode 100644
index 000000000..4bf2e09b5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_flush_test.c
@@ -0,0 +1,146 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha1_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS (SHA1_MAX_LANES - 1)
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][SHA1_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha1_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+uint8_t lens_print_and_check(SHA1_HASH_CTX_MGR * mgr)
+{
+ static int32_t last_lens[SHA1_MAX_LANES] = { 0 };
+ int32_t len;
+ uint8_t num_unchanged = 0;
+ int i;
+ for (i = 0; i < SHA1_MAX_LANES; i++) {
+ len = (int32_t) mgr->mgr.lens[i];
+		// lens[i] in mgr encodes (block_count << 4) | lane_index; convert blocks to bytes below
+ len = (len >= 16) ? (len >> 4 << 6) : 0;
+ printf("\t%d", len);
+ if (last_lens[i] > 0 && last_lens[i] == len)
+ num_unchanged += 1;
+ last_lens[i] = len;
+ }
+ printf("\n");
+ return num_unchanged;
+}
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ uint8_t num_ret, num_unchanged = 0;
+ int ret;
+
+ printf("sha1_mb flush test, %d buffers with %d length: \n", TEST_BUFS, TEST_LEN);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha1_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ lens[i] = TEST_LEN / SHA1_MAX_LANES * (i + 1);
+ bufs[i] = (unsigned char *)malloc(lens[i]);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], lens[i]);
+ }
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Init ctx contexts
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha1_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // Run sb_sha1 test
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ printf("Changes of lens inside mgr:\n");
+ lens_print_and_check(mgr);
+ while (sha1_ctx_mgr_flush(mgr)) {
+ num_ret = lens_print_and_check(mgr);
+ num_unchanged = num_unchanged > num_ret ? num_unchanged : num_ret;
+ }
+ printf("Info of sha1_mb lens prints over\n");
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%08X <=> 0x%08X \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else if (num_unchanged)
+ printf("SHA-NI is used when %d or %d jobs are uncompleted\n",
+ num_unchanged, num_unchanged + 1);
+ else
+ printf("SHA-NI is not used, or used for last job\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_datastruct.asm
new file mode 100644
index 000000000..21c81403b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_datastruct.asm
@@ -0,0 +1,74 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA1 Out Of Order Data Structures
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; LANE_DATA
+;;; name size align
+FIELD _job_in_lane, 8, 8 ; pointer to job object
+END_FIELDS
+
+%assign _LANE_DATA_size _FIELD_OFFSET
+%assign _LANE_DATA_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SHA1_ARGS_X16
+;;; name size align
+FIELD _digest, 4*5*16, 16 ; transposed digest
+FIELD _data_ptr, 8*16, 8 ; array of pointers to data
+END_FIELDS
+
+%assign _SHA1_ARGS_X4_size _FIELD_OFFSET
+%assign _SHA1_ARGS_X4_align _STRUCT_ALIGN
+%assign _SHA1_ARGS_X8_size _FIELD_OFFSET
+%assign _SHA1_ARGS_X8_align _STRUCT_ALIGN
+%assign _SHA1_ARGS_X16_size _FIELD_OFFSET
+%assign _SHA1_ARGS_X16_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; MB_MGR
+;;; name size align
+FIELD _args, _SHA1_ARGS_X4_size, _SHA1_ARGS_X4_align
+FIELD _lens, 4*16, 8
+FIELD _unused_lanes, 8, 8
+FIELD _ldata, _LANE_DATA_size*16, _LANE_DATA_align
+FIELD _num_lanes_inuse, 4, 4
+END_FIELDS
+
+%assign _MB_MGR_size _FIELD_OFFSET
+%assign _MB_MGR_align _STRUCT_ALIGN
+
+_args_digest equ _args + _digest
+_args_data_ptr equ _args + _data_ptr
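The digest area is stored transposed across lanes: word w of lane l sits at dword index w*NLANES + l, which is why the flush routines below read _args_digest + 4*idx + w*16 (or w*32 and w*64 for the wider managers). Here is a hedged scalar sketch of that indexing; NLANES, the helper names, and the 4-lane choice are illustrative assumptions.

#include <stdint.h>

#define NLANES 4   /* 4-lane (SSE/AVX) manager; 8 for AVX2, 16 for AVX512 */

/* digest word w of lane l lives at offset (w*NLANES + l)*4 bytes */
static uint32_t get_lane_digest_word(const uint32_t *args_digest,
                                     unsigned lane, unsigned word)
{
	return args_digest[word * NLANES + lane];
}

/* the same gather the asm performs with movd/pinsrd plus one scalar load */
static void copy_out_digest(const uint32_t *args_digest, unsigned lane,
                            uint32_t out[5])
{
	for (unsigned w = 0; w < 5; w++)
		out[w] = get_lane_digest_word(args_digest, lane, w);
}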
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm
new file mode 100644
index 000000000..c5fd71300
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm
@@ -0,0 +1,247 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x4_avx
+extern sha1_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be in a register not clobbered by sha1_mult
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be in a register not clobbered by sha1_mult
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*2
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_avx(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha1_mb_mgr_flush_avx, function
+sha1_mb_mgr_flush_avx:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to check whether all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+	; compare with the SHA single-buffer threshold; if num_lanes_inuse <= threshold, use the single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA1_SB_THRESHOLD_AVX
+ ja mb_processing
+
+ ; lensN-len2=idx
+ shr len2, 4
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x1000 ; avx has 4 lanes *4, r10b is idx, r10b2 is 16
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha1_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x4_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*16]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
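Because each lens[] entry encodes (remaining_blocks << 4) | lane_index, the cmp/cmovb chain above finds both the shortest remaining length and the lane that owns it in a single pass. A hedged scalar sketch of that selection follows; the helper name is illustrative.

#include <stdint.h>

/* Scalar equivalent of the min-length selection in sha1_mb_mgr_flush_avx. */
static void find_min_lane(const uint32_t lens[4],
                          uint32_t *lane, uint32_t *blocks)
{
	uint32_t min = lens[0];
	for (int i = 1; i < 4; i++)
		if (lens[i] < min)      /* the asm uses cmp/cmovb for the same test */
			min = lens[i];
	*lane = min & 0xF;              /* "and idx, 0xF"                        */
	*blocks = min >> 4;             /* "shr len2, 4" after masking the nibble */
}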
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx2.asm
new file mode 100644
index 000000000..a47ae2838
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx2.asm
@@ -0,0 +1,273 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x8_avx2
+extern sha1_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define tmp4 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define tmp4 rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by sha1_mb_x8_avx2 and sha1_opt_x1
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_avx2(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha1_mb_mgr_flush_avx2, function
+sha1_mb_mgr_flush_avx2:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to check whether all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+ cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [four]
+ cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [five]
+ cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [six]
+ cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [seven]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+	; compare with the SHA single-buffer threshold; if num_lanes_inuse <= threshold, use the single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA1_SB_THRESHOLD_AVX2
+ ja mb_processing
+
+ ; lensN-len2=idx
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x2000 ; avx2 has 8 lanes *4, r10b is idx, r10b2 is 32
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha1_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x8_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*32]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+ mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*32]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
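The AVX2 manager replaces the scalar cmov chain with a SIMD horizontal minimum over all eight lens entries (one vpminud across the two halves, then vpalignr/vpminud folds). Below is a hedged scalar equivalent showing what ends up in the low dword of xmm2; illustrative code only.

#include <stdint.h>

/* Scalar equivalent of the vpminud/vpalignr reduction: pairwise minima are
 * folded until m[0] holds the minimum of all eight (blocks << 4) | lane
 * entries, i.e. the value vmovd extracts into idx. */
static uint32_t horizontal_min8(const uint32_t lens[8])
{
	uint32_t m[8];
	for (int i = 0; i < 8; i++)
		m[i] = lens[i];
	/* step 4: vpminud xmm2,xmm0,xmm1; step 2: vpalignr 8 + vpminud;
	 * step 1: vpalignr 4 + vpminud */
	for (int step = 4; step >= 1; step >>= 1)
		for (int i = 0; i < step; i++)
			m[i] = m[i] < m[i + step] ? m[i] : m[i + step];
	return m[0];
}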
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512.asm
new file mode 100644
index 000000000..5e3db5b9b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512.asm
@@ -0,0 +1,271 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sha1_mb_x16_avx512
+extern sha1_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%else
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common definitions and the registers' state afterwards (unused, covered, unchanged)
+%define state arg1 ; unchanged
+%define job arg2 ; unused
+%define len2 arg2 ; unused
+
+; idx must be a register not clobbered by sha1_mb_x16_avx512
+%define idx rbp ; unchanged
+
+%define unused_lanes rbx ; covered
+%define lane_data rbx ; covered
+%define tmp2 rbx ; covered
+
+%define num_lanes_inuse r9 ; covered
+
+%define job_rax rax ; covered
+%define tmp rax ; unused
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_avx512(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha1_mb_mgr_flush_avx512, function
+sha1_mb_mgr_flush_avx512:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+	; compare with the SHA single-buffer threshold; if num_lanes_inuse <= threshold, use the single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA1_SB_THRESHOLD_AVX512
+ ja mb_processing
+
+ ; lensN-len2=idx
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+	or	r10, 0x4000	; avx512 has 16 lanes *4, r10b is idx, r10b2 is 64
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha1_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x16_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*64]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3
+ mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*64]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_mb_mgr_flush_avx512
+no_sha1_mb_mgr_flush_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
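Every flush variant retires the finished job the same way: it clears _job_in_lane, marks the job STS_COMPLETED, pushes the freed lane index onto unused_lanes (a stack of 4-bit nibbles kept in one 64-bit word) and decrements num_lanes_inuse. Here is a hedged sketch of that nibble-stack bookkeeping; the pop side belongs to the submit routines, which are not part of this hunk, so it is shown here only as an assumption.

#include <stdint.h>

/* "shl unused_lanes, 4; or unused_lanes, idx" from the flush code above */
static void push_free_lane(uint64_t *unused_lanes, uint32_t lane)
{
	*unused_lanes = (*unused_lanes << 4) | (lane & 0xF);
}

/* assumed submit-side counterpart: take the lane in the low nibble */
static uint32_t pop_free_lane(uint64_t *unused_lanes)
{
	uint32_t lane = (uint32_t)(*unused_lanes & 0xF);
	*unused_lanes >>= 4;
	return lane;
}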
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm
new file mode 100644
index 000000000..4170b6c73
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm
@@ -0,0 +1,278 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ %ifdef HAVE_AS_KNOWS_SHANI
+
+extern sha1_mb_x16_avx512
+extern sha1_ni_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%else
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common definitions and the registers' state afterwards (unused, covered, unchanged)
+%define state arg1 ; unchanged
+%define job arg2 ; unused
+%define len2 arg2 ; unused
+
+; idx must be a register not clobbered by sha1_mb_x16_avx512
+%define idx rbp ; unchanged
+
+%define unused_lanes rbx ; covered
+%define lane_data rbx ; covered
+%define tmp2 rbx ; covered
+
+%define num_lanes_inuse r9 ; covered
+
+%define job_rax rax ; covered
+%define tmp rax ; unused
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_avx512_ni(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha1_mb_mgr_flush_avx512_ni, function
+sha1_mb_mgr_flush_avx512_ni:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+	; compare with the SHA-NI single-buffer threshold; if num_lanes_inuse <= threshold, use the SHA-NI single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA1_NI_SB_THRESHOLD_AVX512
+ ja mb_processing
+
+ ; lensN-len2=idx
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+	or	r10, 0x4000	; avx512 has 16 lanes *4, r10b is idx, r10b2 is 64
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha1_ni_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x16_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*64]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3
+ mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*64]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+ %else
+ %ifidn __OUTPUT_FORMAT__, win64
+ global no_sha1_mb_mgr_flush_avx512_ni
+ no_sha1_mb_mgr_flush_avx512_ni:
+ %endif
+ %endif ; HAVE_AS_KNOWS_SHANI
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+ global no_sha1_mb_mgr_flush_avx512_ni
+ no_sha1_mb_mgr_flush_avx512_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm
new file mode 100644
index 000000000..2a4c4b50a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm
@@ -0,0 +1,249 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x4_sse
+extern sha1_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than ARG1, ARG2, rax, r8-r11
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than ARG1, ARG2, rax, r8-r11
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*2
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_sse(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha1_mb_mgr_flush_sse, function
+sha1_mb_mgr_flush_sse:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ movdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to check whether all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+	; compare with the SHA single-buffer threshold; if num_lanes_inuse <= threshold, use the single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA1_SB_THRESHOLD_SSE
+ ja mb_processing
+
+ ; lensN-len2=idx
+ shr len2, 4
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x1000 ; sse has 4 lanes *4, r10b is idx, r10b2 is 16
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha1_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x4_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*16]
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse_ni.asm
new file mode 100644
index 000000000..ea3cffd33
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse_ni.asm
@@ -0,0 +1,256 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+extern sha1_mb_x4_sse
+extern sha1_ni_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than ARG1, ARG2, rax, r8-r11
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than ARG1, ARG2, rax, r8-r11
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*2
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_sse_ni(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha1_mb_mgr_flush_sse_ni, function
+sha1_mb_mgr_flush_sse_ni:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ movdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to check whether all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+	; copy lane idx's data pointer to the empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+	; compare with the SHA-NI single-buffer threshold; if num_lanes_inuse <= threshold, use the single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA1_NI_SB_THRESHOLD_SSE
+ ja mb_processing
+
+	; lens[idx] - len2 == idx, so storing idx leaves the lane with zero whole blocks
+ shr len2, 4
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+	or	r10, 0x1000	; r10 byte 0 = lane idx, byte 1 = 16 (4 SSE lanes * 4 bytes, the digest lane stride)
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha1_ni_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x4_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*16]
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
+%else
+ %ifidn __OUTPUT_FORMAT__, win64
+ global no_sha1_mb_mgr_flush_sse_ni
+ no_sha1_mb_mgr_flush_sse_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_SHANI
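
The flush routine above does three things: it points every empty lane at a live lane's buffer (forcing that lane's length to 0xFFFFFFFF so it can never win the minimum search), finds the smallest packed length across the four lanes, and dispatches either to the SHA-NI single-buffer kernel or to the 4-lane SSE kernel depending on how many lanes are still occupied. A rough C sketch of that control flow, for orientation only: the helpers find_live_lane(), run_kernel() and complete_job() are hypothetical, and the args.data_ptr spelling is assumed from the assembly symbols rather than taken from the C headers.

    /* Illustrative sketch of the flush control flow; not the shipped code. */
    SHA1_JOB *flush_sketch(SHA1_MB_JOB_MGR *state)
    {
        if (state->num_lanes_inuse == 0)
            return NULL;

        int live = find_live_lane(state);            /* hypothetical: first lane holding a job */
        for (int i = 0; i < 4; i++) {
            if (state->ldata[i].job_in_lane == NULL) {
                state->args.data_ptr[i] = state->args.data_ptr[live]; /* borrow a valid buffer */
                state->lens[i] = 0xFFFFFFFF;         /* can never be the minimum */
            }
        }

        uint32_t min = state->lens[0];               /* entries are packed (blocks << 4) | lane */
        for (int i = 1; i < 4; i++)
            if (state->lens[i] < min)
                min = state->lens[i];

        uint32_t idx = min & 0xF, blocks = min >> 4;
        if (blocks)                                  /* choose SHA-NI x1 vs the 4-lane SSE kernel */
            run_kernel(state, idx, blocks,
                       state->num_lanes_inuse <= SHA1_NI_SB_THRESHOLD_SSE);

        return complete_job(state, idx);             /* hypothetical: mark done, free the lane */
    }
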
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c
new file mode 100644
index 000000000..b6124486a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha1_mb.h"
+
+void sha1_mb_mgr_init_avx2(SHA1_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xF76543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA1_X8_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
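
The odd-looking constant 0xF76543210 is a free-lane stack packed into nibbles: lanes 7 down to 0 as 4-bit entries, with 0xF on top as a sentinel. Submit pops the low nibble to claim a lane; completion shifts the word left and ORs the lane back in. The SSE variant below uses 0xF3210 for its 4 lanes, while the AVX-512 variant fills all 16 nibbles (0xfedcba9876543210) and therefore relies on the separate num_lanes_inuse counter, compared against 16, to detect a full manager. A minimal sketch of the two operations, with hypothetical helper names:

    /* Sketch of the packed-nibble free-lane stack (illustrative only). */
    static inline int pop_lane(uint64_t *unused_lanes)
    {
        int lane = (int)(*unused_lanes & 0xF);  /* lowest nibble = next free lane */
        *unused_lanes >>= 4;
        return lane;
    }

    static inline void push_lane(uint64_t *unused_lanes, int lane)
    {
        *unused_lanes = (*unused_lanes << 4) | (uint64_t)lane;  /* return lane to the stack */
    }

After eight pops the AVX2 word collapses to the sentinel 0xF, which is exactly the value the submit routine compares against to decide that every lane is busy.
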
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c
new file mode 100644
index 000000000..033fb3c9f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha1_mb.h"
+
+void sha1_mb_mgr_init_avx512(SHA1_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xfedcba9876543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA1_MAX_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c
new file mode 100644
index 000000000..811c4a9dd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha1_mb.h"
+
+void sha1_mb_mgr_init_sse(SHA1_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xF3210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA1_MIN_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx.asm
new file mode 100644
index 000000000..49c018138
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx.asm
@@ -0,0 +1,246 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x4_avx
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be in a register not clobbered by sha1_mb_x4_avx
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%else
+; LINUX register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+; idx needs to be in a register not clobbered by sha1_mb_x4_avx
+%define last_len rdx
+%define idx rdx
+
+%define size_offset rcx
+%define tmp2 rcx
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*4 + 16*10 + 8
+
+; SHA1_JOB* sha1_mb_mgr_submit_avx(SHA1_MB_JOB_MGR *state, SHA1_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha1_mb_mgr_submit_avx, function
+sha1_mb_mgr_submit_avx:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*4 + 16*0], xmm6
+ vmovdqa [rsp + 8*4 + 16*1], xmm7
+ vmovdqa [rsp + 8*4 + 16*2], xmm8
+ vmovdqa [rsp + 8*4 + 16*3], xmm9
+ vmovdqa [rsp + 8*4 + 16*4], xmm10
+ vmovdqa [rsp + 8*4 + 16*5], xmm11
+ vmovdqa [rsp + 8*4 + 16*6], xmm12
+ vmovdqa [rsp + 8*4 + 16*7], xmm13
+ vmovdqa [rsp + 8*4 + 16*8], xmm14
+ vmovdqa [rsp + 8*4 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ mov DWORD(tmp), [job + _result_digest + 1*16]
+ vmovd [state + _args_digest + 4*lane + 0*16], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3
+ mov [state + _args_digest + 4*lane + 4*16], DWORD(tmp)
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x4_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ mov DWORD(tmp), [state + _args_digest + 4*idx + 4*16]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*4 + 16*0]
+ vmovdqa xmm7, [rsp + 8*4 + 16*1]
+ vmovdqa xmm8, [rsp + 8*4 + 16*2]
+ vmovdqa xmm9, [rsp + 8*4 + 16*3]
+ vmovdqa xmm10, [rsp + 8*4 + 16*4]
+ vmovdqa xmm11, [rsp + 8*4 + 16*5]
+ vmovdqa xmm12, [rsp + 8*4 + 16*6]
+ vmovdqa xmm13, [rsp + 8*4 + 16*7]
+ vmovdqa xmm14, [rsp + 8*4 + 16*8]
+ vmovdqa xmm15, [rsp + 8*4 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+H0: dd 0x67452301
+H1: dd 0xefcdab89
+H2: dd 0x98badcfe
+H3: dd 0x10325476
+H4: dd 0xc3d2e1f0
+
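
The submit path above claims a free lane from the nibble stack, packs the job length as (blocks << 4) | lane into the lens slot, copies the digest and buffer pointer into the lane's columns, and only runs the kernel once every lane holds a job; the job in the lane with the smallest packed length finishes first and is returned. A condensed C sketch of that gating, with hypothetical helpers and the field names implied by the assembly offsets:

    /* Illustrative sketch of the submit gating; not the shipped code. */
    SHA1_JOB *submit_sketch(SHA1_MB_JOB_MGR *state, SHA1_JOB *job)
    {
        int lane = pop_lane(&state->unused_lanes);   /* see the pop_lane() sketch earlier */
        job->status = STS_BEING_PROCESSED;
        state->ldata[lane].job_in_lane = job;
        state->lens[lane] = (job->len << 4) | lane;  /* pack block count and lane index */
        copy_digest_and_buffer(state, job, lane);    /* hypothetical helper */
        state->num_lanes_inuse++;

        if (state->unused_lanes != 0xF)              /* a lane is still free: just queue */
            return NULL;

        /* all 4 lanes busy: advance every lane by the common block count,
         * then finish and return the job in the lane that ran out first */
        return run_min_blocks_and_complete(state);   /* hypothetical helper */
    }
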
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx2.asm
new file mode 100644
index 000000000..95b4f1715
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx2.asm
@@ -0,0 +1,250 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "memcpy.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x8_avx2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%define extra_blocks rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%define extra_blocks rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+; idx must be a register not clobbered by sha1_mb_x8_avx2
+%define idx r8
+%define last_len r8
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+
+%define tmp r9
+
+%define lane_data r10
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* sha1_mb_mgr_submit_avx2(MB_MGR *state, JOB_SHA1 *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha1_mb_mgr_submit_avx2, function
+sha1_mb_mgr_submit_avx2:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+ mov [rsp + 8*6], r14
+ mov [rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+
+ shl len,4
+ or len, lane
+ mov [state + _lens + 4*lane], DWORD(len)
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ mov DWORD(tmp), [job + _result_digest + 1*16]
+
+ vmovd [state + _args_digest + 4*lane + 0*32], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*32], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*32], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*32], xmm0, 3
+ mov [state + _args_digest + 4*lane + 4*32], DWORD(tmp)
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xf
+ jne return_null
+
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x8_avx2
+
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*32]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+ mov DWORD(tmp), [state + _args_digest + 4*idx + 4*32]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ mov r14, [rsp + 8*6]
+ mov r15, [rsp + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+
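
The AVX2 manager replaces the scalar minimum search with a branch-free SIMD reduction: the eight packed lengths are min-folded down to a single dword, the lane nibble is masked off, and the remaining common block count is broadcast and subtracted from every lane. A rough intrinsics equivalent (assuming SSE4.1 for _mm_min_epu32; the shipped code is the assembly above):

    #include <immintrin.h>
    #include <stdint.h>

    /* Illustrative: horizontal min over 8 packed (blocks<<4 | lane) values,
     * then subtract the shared block count from every lane while preserving
     * the lane nibbles (this mirrors the clear_low_nibble mask). */
    static uint32_t min_and_subtract8(uint32_t lens[8])
    {
        __m128i a = _mm_loadu_si128((const __m128i *)&lens[0]);
        __m128i b = _mm_loadu_si128((const __m128i *)&lens[4]);
        __m128i m = _mm_min_epu32(a, b);                /* pairwise min of the halves */
        m = _mm_min_epu32(m, _mm_srli_si128(m, 8));     /* fold the upper qword down  */
        m = _mm_min_epu32(m, _mm_srli_si128(m, 4));     /* fold the last dword        */
        uint32_t min = (uint32_t)_mm_cvtsi128_si32(m);

        __m128i common = _mm_set1_epi32((int)(min & ~0xFu));  /* drop the lane nibble */
        _mm_storeu_si128((__m128i *)&lens[0], _mm_sub_epi32(a, common));
        _mm_storeu_si128((__m128i *)&lens[4], _mm_sub_epi32(b, common));
        return min;        /* caller: lane = min & 0xF, blocks = min >> 4 */
    }

The assembly only performs the subtraction when the block count is non-zero; the sketch leaves that check to the caller.
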
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx512.asm
new file mode 100644
index 000000000..a4f9389a1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx512.asm
@@ -0,0 +1,248 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "memcpy.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sha1_mb_x16_avx512
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%else
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common definitions, annotated with their state after the kernel call (unused, covered, unchanged)
+%define state arg1 ; unchanged, mb_x16's input1
+%define job arg2 ; arg2 unused
+%define len2 arg2 ; arg2 unused, mb_x16's input2
+
+; idx must be a register not clobbered by sha1_mb_x16_avx512
+%define idx r8 ; unchanged
+
+%define p r11 ; unused
+
+%define unused_lanes rbx ; covered
+
+%define job_rax rax ; covered
+%define len rax ; unused
+
+%define lane rbp ; unused
+
+%define tmp r9 ; covered
+%define num_lanes_inuse r9 ; covered
+
+%define lane_data r10 ; covered
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* sha1_mb_mgr_submit_avx512(MB_MGR *state, JOB_SHA1 *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha1_mb_mgr_submit_avx512, function
+sha1_mb_mgr_submit_avx512:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+ mov [rsp + 8*6], r14
+ mov [rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+
+ shl len,4
+ or len, lane
+ mov [state + _lens + 4*lane], DWORD(len)
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ mov DWORD(tmp), [job + _result_digest + 1*16]
+
+ vmovd [state + _args_digest + 4*lane + 0*64], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*64], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*64], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*64], xmm0, 3
+ mov [state + _args_digest + 4*lane + 4*64], DWORD(tmp)
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ add num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ cmp num_lanes_inuse, 16
+ jne return_null
+
+start_loop:
+	; Find min length; ymm0 holds the first 8 lane lengths, ymm1 the last 8
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+	and	idx, 0xF	; idx is the index of the min-length lane
+ shr len2, 4 ; size in blocks
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x16_avx512
+
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*64]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3
+ mov DWORD(tmp), [state + _args_digest + 4*idx + 4*64]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ mov r14, [rsp + 8*6]
+ mov r15, [rsp + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=32
+
+align 32
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_mb_mgr_submit_avx512
+no_sha1_mb_mgr_submit_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
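
The 4*lane + i*64 addressing above implies a transposed (word-major) digest layout: word i of every lane is stored contiguously, so one lane's SHA-1 state is a strided column, which is what lets the x16 kernel load word i of all 16 lanes with a single vector load. The row stride is 4 bytes times the lane count, hence the 16/32/64 factors seen in the SSE, AVX2 and AVX-512 managers. A small sketch of reading and writing one lane's column (NUM_LANES would be 16 here):

    /* Illustrative scatter/gather of one lane's digest column. */
    #define NUM_LANES 16

    static void scatter_digest(uint32_t args_digest[5][NUM_LANES],
                               const uint32_t digest[5], int lane)
    {
        for (int i = 0; i < 5; i++)
            args_digest[i][lane] = digest[i];   /* column write, row stride 4*NUM_LANES bytes */
    }

    static void gather_digest(uint32_t digest[5],
                              const uint32_t args_digest[5][NUM_LANES], int lane)
    {
        for (int i = 0; i < 5; i++)
            digest[i] = args_digest[i][lane];
    }
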
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse.asm
new file mode 100644
index 000000000..9989a9a1d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse.asm
@@ -0,0 +1,246 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x4_sse
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than ARG2, rax, r8-r11
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%else
+; LINUX register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+; idx needs to be other than ARG2, rax, r8-r11
+%define last_len rdx
+%define idx rdx
+
+%define size_offset rcx
+%define tmp2 rcx
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*4 + 16*10 + 8
+
+; SHA1_JOB* sha1_mb_mgr_submit_sse(SHA1_MB_JOB_MGR *state, SHA1_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha1_mb_mgr_submit_sse, function
+sha1_mb_mgr_submit_sse:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ movdqa [rsp + 8*4 + 16*0], xmm6
+ movdqa [rsp + 8*4 + 16*1], xmm7
+ movdqa [rsp + 8*4 + 16*2], xmm8
+ movdqa [rsp + 8*4 + 16*3], xmm9
+ movdqa [rsp + 8*4 + 16*4], xmm10
+ movdqa [rsp + 8*4 + 16*5], xmm11
+ movdqa [rsp + 8*4 + 16*6], xmm12
+ movdqa [rsp + 8*4 + 16*7], xmm13
+ movdqa [rsp + 8*4 + 16*8], xmm14
+ movdqa [rsp + 8*4 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ movdqa xmm0, [job + _result_digest + 0*16]
+ mov DWORD(tmp), [job + _result_digest + 1*16]
+ movd [state + _args_digest + 4*lane + 0*16], xmm0
+ pextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1
+ pextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2
+ pextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3
+ mov [state + _args_digest + 4*lane + 4*16], DWORD(tmp)
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x4_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ mov DWORD(tmp), [state + _args_digest + 4*idx + 4*16]
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + 8*4 + 16*0]
+ movdqa xmm7, [rsp + 8*4 + 16*1]
+ movdqa xmm8, [rsp + 8*4 + 16*2]
+ movdqa xmm9, [rsp + 8*4 + 16*3]
+ movdqa xmm10, [rsp + 8*4 + 16*4]
+ movdqa xmm11, [rsp + 8*4 + 16*5]
+ movdqa xmm12, [rsp + 8*4 + 16*6]
+ movdqa xmm13, [rsp + 8*4 + 16*7]
+ movdqa xmm14, [rsp + 8*4 + 16*8]
+ movdqa xmm15, [rsp + 8*4 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+H0: dd 0x67452301
+H1: dd 0xefcdab89
+H2: dd 0x98badcfe
+H3: dd 0x10325476
+H4: dd 0xc3d2e1f0
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse_ni.asm
new file mode 100644
index 000000000..979324de4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse_ni.asm
@@ -0,0 +1,290 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+extern sha1_mb_x4_sse
+extern sha1_ni_x2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than ARG2, rax, r8-r11
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%else
+; LINUX register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+; idx needs to be other than ARG2, rax, r8-r11
+%define last_len rdx
+%define idx rdx
+
+%define size_offset rcx
+%define tmp2 rcx
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*6 + 16*10 + 8
+
+; SHA1_JOB* sha1_mb_mgr_submit_sse_ni(SHA1_MB_JOB_MGR *state, SHA1_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha1_mb_mgr_submit_sse_ni, function
+sha1_mb_mgr_submit_sse_ni:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+	movdqa  [rsp + 8*6 + 16*0], xmm6	; 8*6 keeps the XMM area clear of the r12/r13 slots at 8*4/8*5
+	movdqa  [rsp + 8*6 + 16*1], xmm7
+	movdqa  [rsp + 8*6 + 16*2], xmm8
+	movdqa  [rsp + 8*6 + 16*3], xmm9
+	movdqa  [rsp + 8*6 + 16*4], xmm10
+	movdqa  [rsp + 8*6 + 16*5], xmm11
+	movdqa  [rsp + 8*6 + 16*6], xmm12
+	movdqa  [rsp + 8*6 + 16*7], xmm13
+	movdqa  [rsp + 8*6 + 16*8], xmm14
+	movdqa  [rsp + 8*6 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ movdqa xmm0, [job + _result_digest + 0*16]
+ mov DWORD(tmp), [job + _result_digest + 1*16]
+ movd [state + _args_digest + 4*lane + 0*16], xmm0
+ pextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1
+ pextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2
+ pextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3
+ mov [state + _args_digest + 4*lane + 4*16], DWORD(tmp)
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+
+ cmp unused_lanes, 0xF32 ; we will process two jobs at the same time
+ jne return_null ; wait for another sha_ni job
+
+	; compare with the SHA-NI single-buffer threshold; if the SSE lane count <= threshold, use the SHA-NI function
+ %if SHA1_NI_SB_THRESHOLD_SSE >= 4 ; there are 4 lanes in sse mb
+ ; shani glue code
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+	; after subtracting len2, the selected lane's length is just idx (zero whole blocks left)
+ sub lens0, len2
+ sub lens1, len2
+
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov r10, idx
+	or	r10, 0x1000	; r10 byte 0 = lane idx, byte 1 = 16 (4 SSE lanes * 4 bytes, the digest lane stride)
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha1_ni_x2
+ ; state and idx are intact
+
+ %else
+ ; original mb code
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x4_sse
+ ; state and idx are intact
+ %endif
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ mov DWORD(tmp), [state + _args_digest + 4*idx + 4*16]
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	movdqa  xmm6, [rsp + 8*6 + 16*0]
+	movdqa  xmm7, [rsp + 8*6 + 16*1]
+	movdqa  xmm8, [rsp + 8*6 + 16*2]
+	movdqa  xmm9, [rsp + 8*6 + 16*3]
+	movdqa  xmm10, [rsp + 8*6 + 16*4]
+	movdqa  xmm11, [rsp + 8*6 + 16*5]
+	movdqa  xmm12, [rsp + 8*6 + 16*6]
+	movdqa  xmm13, [rsp + 8*6 + 16*7]
+	movdqa  xmm14, [rsp + 8*6 + 16*8]
+	movdqa  xmm15, [rsp + 8*6 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+H0: dd 0x67452301
+H1: dd 0xefcdab89
+H2: dd 0x98badcfe
+H3: dd 0x10325476
+H4: dd 0xc3d2e1f0
+
+%else
+ %ifidn __OUTPUT_FORMAT__, win64
+ global no_sha1_mb_mgr_submit_sse_ni
+ no_sha1_mb_mgr_submit_sse_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_SHANI
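
When the SHA-NI path is compiled in, this submit variant does not wait for all four SSE lanes: it queues jobs until the free-lane stack reads 0xF32 (two pops from the initial 0xF3210, i.e. lanes 0 and 1 are occupied) and then drives the two-buffer sha1_ni_x2 kernel over the block count the two jobs have in common. A small sketch of that gate, with a hypothetical helper name:

    /* Illustrative check mirroring the `cmp unused_lanes, 0xF32` gate above. */
    static int sha1_ni_batch_ready(uint64_t unused_lanes, const uint32_t lens[2],
                                   uint32_t *idx, uint32_t *blocks)
    {
        if (unused_lanes != 0xF32)              /* fewer than two jobs queued: keep waiting */
            return 0;
        uint32_t min = lens[0] < lens[1] ? lens[0] : lens[1];
        *idx    = min & 0xF;                    /* lane that finishes first      */
        *blocks = min >> 4;                     /* whole blocks both lanes share */
        return *blocks != 0;
    }
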
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c
new file mode 100644
index 000000000..3925a6f4b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c
@@ -0,0 +1,159 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha1_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 200
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA1_DIGEST_NWORDS];
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ int ret;
+
+ printf("multibinary_sha1 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed, test aborted\n");
+ return 1;
+ }
+
+ sha1_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+			printf("malloc failed, test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // SSL test
+ SHA1(bufs[i], TEST_LEN, digest_ssl[i]);
+
+ // sb_sha1 test
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha1_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Random buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run SSL test
+ SHA1(bufs[i], lens[i], digest_ssl[i]);
+
+ // Run sb_sha1 test
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha1_ssl rand: Pass\n");
+
+ return fail;
+}
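
The test above also documents the intended calling pattern for the context-manager API: allocate a 16-byte-aligned SHA1_HASH_CTX_MGR, initialize it once, hash_ctx_init() each context, submit buffers with HASH_ENTIRE, and keep calling sha1_ctx_mgr_flush() until it returns NULL to drain the remaining lanes. Distilled into a minimal single-buffer sketch (error handling trimmed):

    #include <stdlib.h>
    #include "sha1_mb.h"

    /* Minimal use of the multi-buffer context-manager API (sketch). */
    int sha1_one_buffer(void *buf, uint32_t len, uint32_t digest[SHA1_DIGEST_NWORDS])
    {
        SHA1_HASH_CTX_MGR *mgr = NULL;
        SHA1_HASH_CTX ctx;

        if (posix_memalign((void **)&mgr, 16, sizeof(*mgr)))
            return 1;
        sha1_ctx_mgr_init(mgr);
        hash_ctx_init(&ctx);

        sha1_ctx_mgr_submit(mgr, &ctx, buf, len, HASH_ENTIRE);
        while (sha1_ctx_mgr_flush(mgr))         /* drain until every lane completes */
            ;

        for (int i = 0; i < SHA1_DIGEST_NWORDS; i++)
            digest[i] = ctx.job.result_digest[i];
        free(mgr);
        return 0;
    }

With a single job the multi-buffer manager cannot fill its lanes, so the digest only appears once flush() forces the partially filled batch through; batching many submissions before flushing is what the interface is designed for.
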
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c
new file mode 100644
index 000000000..4eeeaba0a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c
@@ -0,0 +1,202 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha1_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][SHA1_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha1_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ uint8_t *tmp_buf;
+ int ret;
+
+ printf("multibinary_sha1 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed, test aborted\n");
+ return 1;
+ }
+
+ sha1_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+			printf("malloc failed, test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha1_ref(bufs[i], digest_ref[i], TEST_LEN);
+
+ // Run sb_sha1 test
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
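+	// Note: submit() queues work across the manager's lanes and can return before a
+	// given job is complete; looping on flush() until it returns NULL drains the
+	// remaining lanes so every result_digest is final before it is checked.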
+ while (sha1_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%08X <=> 0x%08X \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha1_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Use buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run reference test
+ sha1_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // Run sha1_mb test
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail "
+ "0x%08X <=> 0x%08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ // Test at the end of buffer
+ jobs = rand() % TEST_BUFS;
+ tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs);
+ if (!tmp_buf) {
+ printf("malloc failed, end test aborted.\n");
+ return 1;
+ }
+
+ rand_buffer(tmp_buf, jobs);
+
+ sha1_ctx_mgr_init(mgr);
+
+	// Construct jobs as suffixes that run to the end of the allocated buffer
+ for (i = 0; i < jobs; i++) {
+ bufs[i] = (uint8_t *) & tmp_buf[i];
+ lens[i] = jobs - i;
+
+ // Reference test
+ sha1_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // sb_sha1 test
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("End test failed at offset %d - result: 0x%08X"
+ ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ putchar('.');
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha1 rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_update_test.c
new file mode 100644
index 000000000..aaa52a0ff
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_update_test.c
@@ -0,0 +1,297 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha1_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define UPDATE_SIZE 13*SHA1_BLOCK_SIZE
+#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*SHA1_BLOCK_SIZE))
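+// UPDATE_SIZE is the fixed chunk used for each HASH_FIRST/HASH_UPDATE submission in
+// the sequential pass below; MAX_RAND_UPDATE_BLOCKS bounds the randomly chosen
+// update lengths used in the randomized pass.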
+
+#ifdef DEBUG
+# define debug_char(x) putchar(x)
+#else
+# define debug_char(x) do {} while (0)
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint32_t digest_ref[TEST_BUFS][SHA1_DIGEST_NWORDS];
+
+extern void sha1_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, fail = 0;
+ int len_done, len_rem, len_rand;
+ unsigned char *bufs[TEST_BUFS];
+ unsigned char *buf_ptr[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int joblen, jobs, t;
+ int ret;
+
+ printf("multibinary_sha1_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed, test aborted\n");
+ return 1;
+ }
+
+ sha1_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ buf_ptr[i] = bufs[i];
+ if (bufs[i] == NULL) {
+			printf("malloc failed, test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha1_ref(bufs[i], digest_ref[i], TEST_LEN);
+ }
+
+ // Run sb_sha1 tests
+ for (i = 0; i < TEST_BUFS;) {
+ len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_done == 0)
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_FIRST);
+ else if (len_rem <= UPDATE_SIZE)
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+		// No ctx returned (lanes still filling) or job finished: move to next buffer
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha1_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha1_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+
+ len_done = (int)((unsigned long)buf_ptr[i]
+ - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_rem <= UPDATE_SIZE)
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha1_ctx_mgr_flush(mgr);
+ }
+
+ // Check digests
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+				printf("Test%d fixed size, digest%d fail %8X <=> %8X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ for (i = 0; i < jobs; i++) {
+ joblen = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], joblen);
+ lens[i] = joblen;
+ buf_ptr[i] = bufs[i];
+ sha1_ref(bufs[i], digest_ref[i], lens[i]);
+ }
+
+ sha1_ctx_mgr_init(mgr);
+
+ // Run sha1_sb jobs
+ i = 0;
+ while (i < jobs) {
+ // Submit a new job
+ len_rand = SHA1_BLOCK_SIZE +
+ SHA1_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS);
+
+ if (lens[i] > len_rand)
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_FIRST);
+ else
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], lens[i], HASH_ENTIRE);
+
+ // Returned ctx could be:
+ // - null context (we are just getting started and lanes aren't full yet), or
+ // - finished already (an ENTIRE we submitted or a previous LAST is returned), or
+ // - an unfinished ctx, we will resubmit
+
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ } else {
+ // unfinished ctx returned, choose another random update length and submit either
+ // UPDATE or LAST depending on the amount of buffer remaining
+ while ((ctx != NULL) && !(hash_ctx_complete(ctx))) {
+ j = (unsigned long)(ctx->user_data); // Get index of the returned ctx
+ buf_ptr[j] = bufs[j] + ctx->total_length;
+ len_rand = (rand() % SHA1_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ len_rem = lens[j] - ctx->total_length;
+
+ if (len_rem <= len_rand) // submit the rest of the job as LAST
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rem, HASH_LAST);
+ else // submit the random update length as UPDATE
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rand,
+ HASH_UPDATE);
+ } // Either continue submitting any contexts returned here as UPDATE/LAST, or
+ // go back to submitting new jobs using the index i.
+
+ i++;
+ }
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha1_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha1_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer
+ len_rem = lens[i] - ctx->total_length;
+ len_rand = (rand() % SHA1_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ debug_char('+');
+ if (len_rem <= len_rand)
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha1_ctx_mgr_flush(mgr);
+ }
+
+ // Check result digest
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail %8X <=> %8X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha1_update rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c
new file mode 100644
index 000000000..6261bbf44
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c
@@ -0,0 +1,233 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sha1_mb.h"
+#include "endian_helper.h"
+
+typedef uint32_t DigestSHA1[SHA1_DIGEST_NWORDS];
+
+#define MSGS 7
+#define NUM_JOBS 1000
+
+#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS
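+// Deterministic scramble of the job index: maps each of the NUM_JOBS submissions to
+// one of the MSGS known-answer messages, so jobs of different lengths end up
+// interleaved across the lanes.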
+static uint8_t msg1[] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq";
+static DigestSHA1 expResultDigest1 =
+ { 0x84983E44, 0x1C3BD26E, 0xBAAE4AA1, 0xF95129E5, 0xE54670F1 };
+
+static uint8_t msg2[] = "0123456789:;<=>?@ABCDEFGHIJKLMNO";
+static DigestSHA1 expResultDigest2 =
+ { 0xB7C66452, 0x0FD122B3, 0x55D539F2, 0xA35E6FAA, 0xC2A5A11D };
+
+static uint8_t msg3[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<";
+static DigestSHA1 expResultDigest3 =
+ { 0x127729B6, 0xA8B2F8A0, 0xA4DDC819, 0x08E1D8B3, 0x67CEEA55 };
+
+static uint8_t msg4[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR";
+static DigestSHA1 expResultDigest4 =
+ { 0xFDDE2D00, 0xABD5B7A3, 0x699DE6F2, 0x3FF1D1AC, 0x3B872AC2 };
+
+static uint8_t msg5[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?";
+static DigestSHA1 expResultDigest5 =
+ { 0xE7FCA85C, 0xA4AB3740, 0x6A180B32, 0x0B8D362C, 0x622A96E6 };
+
+static uint8_t msg6[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU";
+static DigestSHA1 expResultDigest6 =
+ { 0x505B0686, 0xE1ACDF42, 0xB3588B5A, 0xB043D52C, 0x6D8C7444 };
+
+static uint8_t msg7[] = "";
+static DigestSHA1 expResultDigest7 =
+ { 0xDA39A3EE, 0x5E6B4B0D, 0x3255BFEF, 0x95601890, 0xAFD80709 };
+
+static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7 };
+
+static uint32_t *expResultDigest[MSGS] = {
+ expResultDigest1, expResultDigest2, expResultDigest3,
+ expResultDigest4, expResultDigest5, expResultDigest6,
+ expResultDigest7
+};
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+ uint32_t i, j, k, t, checked = 0;
+ uint32_t *good;
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed, test aborted\n");
+ return 1;
+ }
+
+ sha1_ctx_mgr_init(mgr);
+
+ // Init contexts before first use
+ for (i = 0; i < MSGS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ for (i = 0; i < MSGS; i++) {
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i], msgs[i],
+ strlen((char *)msgs[i]), HASH_ENTIRE);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+
+ }
+ }
+
+ while (1) {
+ ctx = sha1_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+				printf("Something bad happened during the flush."
+				       " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ // do larger test in pseudo-random order
+
+ // Init contexts before first use
+ for (i = 0; i < NUM_JOBS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ checked = 0;
+ for (i = 0; i < NUM_JOBS; i++) {
+ j = PSEUDO_RANDOM_NUM(i);
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ }
+ }
+ while (1) {
+ ctx = sha1_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+				printf("Something bad happened during the flush."
+				       " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ if (checked != NUM_JOBS) {
+ printf("only tested %d rather than %d\n", checked, NUM_JOBS);
+ return -1;
+ }
+
+ printf(" multibinary_sha1 test: Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c
new file mode 100644
index 000000000..bd8e5e527
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c
@@ -0,0 +1,128 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha1_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS 32
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 10000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
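+// In the cold case TEST_LEN is chosen so the combined working set
+// (TEST_BUFS * TEST_LEN) exceeds a typical last-level cache and each timed pass
+// streams from memory; the warm case loops over small, cached buffers instead.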
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA1_DIGEST_NWORDS];
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+			printf("calloc failed, test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ sha1_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ SHA1(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sha1_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sha1" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
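+	// OpenSSL's SHA1() emits the digest as a big-endian byte stream, while
+	// result_digest holds native uint32 words, so each OpenSSL word is converted
+	// with to_be32() before the comparison.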
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+
+ printf("Multi-buffer sha1 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha1_ossl_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_shortage_perf.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_shortage_perf.c
new file mode 100644
index 000000000..0b4438d53
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_shortage_perf.c
@@ -0,0 +1,132 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha1_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS SHA1_MAX_LANES
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 10000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA1_DIGEST_NWORDS];
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ uint32_t nlanes;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+			printf("calloc failed, test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ sha1_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ SHA1(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sha1_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb shortage tests
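+	// Sweep the number of submitted jobs from SHA1_MAX_LANES down to 1; with fewer
+	// jobs than lanes, flush() has to process partially filled batches, measuring
+	// how throughput behaves when the manager is short of work.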
+ for (nlanes = TEST_BUFS; nlanes > 0; nlanes--) {
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < nlanes; i++)
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN,
+ HASH_ENTIRE);
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sha1" TEST_TYPE_STR " with %d lanes: ", nlanes);
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < nlanes; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ }
+
+ printf("Multi-buffer sha1 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+		printf(" multibinary_sha1_ossl_shortage_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm
new file mode 100644
index 000000000..d64ffe2bd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm
@@ -0,0 +1,563 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute 16 SHA1 digests in parallel using AVX-512
+;; outer calling routine takes care of save and restore of XMM registers
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; zmm0-31
+;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp r8
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp r8
+;;
+;; clobbers zmm0-31
+
+%define APPEND(a,b) a %+ b
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx ; arg0 preserved
+ %define arg2 rdx ; arg1
+ %define reg3 r8 ; arg2 preserved
+ %define reg4 r9 ; arg3
+ %define var1 rdi
+ %define var2 rsi
+ %define local_func_decl(func_name) global func_name
+ %else
+ %define arg1 rdi ; arg0
+ %define arg2 rsi ; arg1
+ %define var1 rdx ; arg2
+ %define var2 rcx ; arg3
+ %define local_func_decl(func_name) mk_global func_name, function, internal
+%endif
+
+%define state arg1
+%define num_blks arg2
+
+%define IN (state + _data_ptr)
+%define DIGEST state
+%define SIZE num_blks
+
+%define IDX var1
+
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define KT zmm5
+%define AA zmm6
+%define BB zmm7
+%define CC zmm8
+%define DD zmm9
+%define EE zmm10
+%define TMP0 zmm11
+%define TMP1 zmm12
+%define TMP2 zmm13
+
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 rax
+
+%macro TRANSPOSE16 18
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%r8 %9
+%define %%r9 %10
+%define %%r10 %11
+%define %%r11 %12
+%define %%r12 %13
+%define %%r13 %14
+%define %%r14 %15
+%define %%r15 %16
+%define %%t0 %17
+%define %%t1 %18
+
+; r0 = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0}
+; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0}
+; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0}
+; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0}
+; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0}
+; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0}
+; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0}
+; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0}
+; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0}
+
+; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
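+;
+; i.e. after the transpose, output register N holds dword N of the 64-byte block
+; from all 16 lanes; together with the caller's byte-flip shuffle this is the
+; layout the vectorized rounds below consume.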
+
+
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2}
+
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2}
+
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0}
+
+ ; use r6 in place of t0
+ vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0}
+ vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2}
+ vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0}
+ vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2}
+
+	vshufps	%%r11, %%r6, %%t1, 0xDD	; r11 = {l13 k13 j13 i13 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1}
+	vshufps	%%r9, %%r8, %%r10, 0x88	; r9 = {l14 k14 j14 i14 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2}
+	vshufps	%%r8, %%r8, %%r10, 0xDD	; r8 = {l15 k15 j15 i15 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3}
+	vshufps	%%r6, %%r6, %%t1, 0x88	; r6 = {l12 k12 j12 i12 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0}
+
+ ; use r10 in place of t0
+	vshufps	%%r10, %%r12, %%r13, 0x44	; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 m1 m0}
+	vshufps	%%r12, %%r12, %%r13, 0xEE	; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 m3 m2}
+	vshufps	%%t1, %%r14, %%r15, 0x44	; t1 = {p13 p12 o13 o12 p9 p8 o9 o8 p5 p4 o5 o4 p1 p0 o1 o0}
+	vshufps	%%r14, %%r14, %%r15, 0xEE	; r14 = {p15 p14 o15 o14 p11 p10 o11 o10 p7 p6 o7 o6 p3 p2 o3 o2}
+
+	vshufps	%%r15, %%r10, %%t1, 0xDD	; r15 = {p13 o13 n13 m13 p9 o9 n9 m9 p5 o5 n5 m5 p1 o1 n1 m1}
+	vshufps	%%r13, %%r12, %%r14, 0x88	; r13 = {p14 o14 n14 m14 p10 o10 n10 m10 p6 o6 n6 m6 p2 o2 n2 m2}
+	vshufps	%%r12, %%r12, %%r14, 0xDD	; r12 = {p15 o15 n15 m15 p11 o11 n11 m11 p7 o7 n7 m7 p3 o3 n3 m3}
+	vshufps	%%r10, %%r10, %%t1, 0x88	; r10 = {p12 o12 n12 m12 p8 o8 n8 m8 p4 o4 n4 m4 p0 o0 n0 m0}
+
+;; At this point, the registers that contain interesting data are:
+;; t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12
+;; Can use t1 and r14 as scratch registers
+
+ vmovdqa32 %%r14, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0}
+ vmovdqa32 %%t1, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vmovdqa32 %%r2, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1}
+ vmovdqa32 %%t0, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vmovdqa32 %%r3, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r7, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vmovdqa32 %%r1, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3}
+ vmovdqa32 %%r5, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vmovdqa32 %%r0, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0}
+ vmovdqa32 %%r4, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4}
+
+ vmovdqa32 %%r6, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1}
+ vmovdqa32 %%r10, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5}
+
+ vmovdqa32 %%r11, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r11, %%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2}
+ vmovdqa32 %%r15, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6}
+
+ vmovdqa32 %%r9, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3}
+ vmovdqa32 %%r13, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7}
+
+;; At this point r8 and r12 can be used as scratch registers
+
+ vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+ vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+
+ vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+ vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+ vshuff64x2 %%t1, %%r7, %%r15, 0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+ vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+ vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+ vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+
+ vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+ vshuff64x2 %%r3, %%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+
+ vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+ vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+
+ vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+%endmacro
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%F_IMMED %2
+
+ ; T = ROTL_5(A) + Ft(B,C,D) + E + Kt + Wt
+ ; E=D, D=C, C=ROTL_30(B), B=A, A=T
+
+ ; Ft
+ ; 0-19 Ch(B,C,D) = (B&C) ^ (~B&D)
+ ; 20-39, 60-79 Parity(B,C,D) = B ^ C ^ D
+ ; 40-59 Maj(B,C,D) = (B&C) ^ (B&D) ^ (C&D)
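+	;
+	; Each Ft is computed with one vpternlogd: with TMP1=B, C, D as the three
+	; sources, imm8 bit (B<<2 | C<<1 | D) gives the result, so
+	;   Ch     -> imm8 0xCA   (B ? C : D)
+	;   Parity -> imm8 0x96   (B ^ C ^ D)
+	;   Maj    -> imm8 0xE8   (at least two of B,C,D set)
+	; which is why the round loop below switches %%F_IMMED between 0xCA, 0x96 and 0xE8.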
+
+ vmovdqa32 TMP1, B ; Copy B
+ vpaddd E, E, %%WT ; E = E + Wt
+ vpternlogd TMP1, C, D, %%F_IMMED ; TMP1 = Ft(B,C,D)
+ vpaddd E, E, KT ; E = E + Wt + Kt
+ vprold TMP0, A, 5 ; TMP0 = ROTL_5(A)
+ vpaddd E, E, TMP1 ; E = Ft(B,C,D) + E + Kt + Wt
+ vprold B, B, 30 ; B = ROTL_30(B)
+ vpaddd E, E, TMP0 ; E = T
+
+ ROTATE_ARGS
+%endmacro
+
+%macro MSG_SCHED_ROUND_16_79 4
+%define %%WT %1
+%define %%WTp2 %2
+%define %%WTp8 %3
+%define %%WTp13 %4
+ ; Wt = ROTL_1(Wt-3 ^ Wt-8 ^ Wt-14 ^ Wt-16)
+ ; Wt+16 = ROTL_1(Wt+13 ^ Wt+8 ^ Wt+2 ^ Wt)
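+	; (vpternlogd with imm8 0x96 is a three-way XOR, so the ternlog plus the vpxord
+	; below form Wt ^ Wt+2 ^ Wt+8 ^ Wt+13 and the vprold completes the ROTL_1, for
+	; all 16 lanes at once)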
+ vpternlogd %%WT, %%WTp2, %%WTp8, 0x96
+ vpxord %%WT, %%WT, %%WTp13
+ vprold %%WT, %%WT, 1
+%endmacro
+
+; Note this is reading in a block of data for one lane
+; When all 16 are read, the data must be transposed to build msg schedule
+%macro MSG_SCHED_ROUND_00_15 2
+%define %%WT %1
+%define %%OFFSET %2
+ mov inp0, [IN + (%%OFFSET*8)]
+ vmovups %%WT, [inp0+IDX]
+%endmacro
+
+align 64
+
+; void sha1_mb_x16_avx512(SHA1_MB_ARGS_X16, uint32_t size)
+; arg 1 : pointer to input data
+; arg 2 : size (in blocks) ;; assumed to be >= 1
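+;
+; The digest area addressed through state is laid out word-major: 64 bytes of A
+; values (one dword per lane), then 64 bytes of B, and so on, which is what the
+; vmovups loads and stores below assume.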
+local_func_decl(sha1_mb_x16_avx512)
+sha1_mb_x16_avx512:
+ endbranch
+
+ ;; Initialize digests
+ vmovups A, [DIGEST + 0*64]
+ vmovups B, [DIGEST + 1*64]
+ vmovups C, [DIGEST + 2*64]
+ vmovups D, [DIGEST + 3*64]
+ vmovups E, [DIGEST + 4*64]
+
+ xor IDX, IDX
+
+ ;; transpose input onto stack
+ mov inp0, [IN + 0*8]
+ mov inp1, [IN + 1*8]
+ mov inp2, [IN + 2*8]
+ mov inp3, [IN + 3*8]
+ mov inp4, [IN + 4*8]
+ mov inp5, [IN + 5*8]
+ mov inp6, [IN + 6*8]
+ mov inp7, [IN + 7*8]
+
+ vmovups W0,[inp0+IDX]
+ vmovups W1,[inp1+IDX]
+ vmovups W2,[inp2+IDX]
+ vmovups W3,[inp3+IDX]
+ vmovups W4,[inp4+IDX]
+ vmovups W5,[inp5+IDX]
+ vmovups W6,[inp6+IDX]
+ vmovups W7,[inp7+IDX]
+
+ mov inp0, [IN + 8*8]
+ mov inp1, [IN + 9*8]
+ mov inp2, [IN +10*8]
+ mov inp3, [IN +11*8]
+ mov inp4, [IN +12*8]
+ mov inp5, [IN +13*8]
+ mov inp6, [IN +14*8]
+ mov inp7, [IN +15*8]
+
+ vmovups W8, [inp0+IDX]
+ vmovups W9, [inp1+IDX]
+ vmovups W10,[inp2+IDX]
+ vmovups W11,[inp3+IDX]
+ vmovups W12,[inp4+IDX]
+ vmovups W13,[inp5+IDX]
+ vmovups W14,[inp6+IDX]
+ vmovups W15,[inp7+IDX]
+
+lloop:
+ vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK]
+
+ add IDX, 64
+
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+
+%assign I 0
+%rep 16
+ vpshufb APPEND(W,I), APPEND(W,I), TMP2
+%assign I (I+1)
+%endrep
+
+ ; Save digests for later addition
+ vmovdqa32 AA, A
+ vmovdqa32 BB, B
+ vmovdqa32 CC, C
+ vmovdqa32 DD, D
+ vmovdqa32 EE, E
+
+ vmovdqa32 KT, [K00_19]
+%assign I 0xCA
+%assign J 0
+%assign K 2
+%assign L 8
+%assign M 13
+%assign N 0
+%rep 64
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+ %if N = 19
+ vmovdqa32 KT, [K20_39]
+ %assign I 0x96
+ %elif N = 39
+ vmovdqa32 KT, [K40_59]
+ %assign I 0xE8
+ %elif N = 59
+ vmovdqa32 KT, [K60_79]
+ %assign I 0x96
+ %endif
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%assign N (N+1)
+%endrep
+
+ ; Check if this is the last block
+ sub SIZE, 1
+ je lastLoop
+
+%assign I 0x96
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_00_15 APPEND(W,J), J
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+
+ jmp lloop
+
+lastLoop:
+; Need to reset argument rotation values to Round 64 values
+%xdefine TMP_ A
+%xdefine A B
+%xdefine B C
+%xdefine C D
+%xdefine D E
+%xdefine E TMP_
+
+ ; Process last 16 rounds
+%assign I 0x96
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+
+ ;; update into data pointers
+%assign I 0
+%rep 8
+ mov inp0, [IN + (2*I)*8]
+ mov inp1, [IN + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [IN + (2*I)*8], inp0
+ mov [IN + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+
+ ; Write out digest
+ ; Do we need to untranspose digests???
+ vmovups [DIGEST + 0*64], A
+ vmovups [DIGEST + 1*64], B
+ vmovups [DIGEST + 2*64], C
+ vmovups [DIGEST + 3*64], D
+ vmovups [DIGEST + 4*64], E
+
+ ret
+
+section .data
+align 64
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000
+ dq 0x0000000000000001
+ dq 0x0000000000000008
+ dq 0x0000000000000009
+ dq 0x0000000000000004
+ dq 0x0000000000000005
+ dq 0x000000000000000C
+ dq 0x000000000000000D
+
+PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002
+ dq 0x0000000000000003
+ dq 0x000000000000000A
+ dq 0x000000000000000B
+ dq 0x0000000000000006
+ dq 0x0000000000000007
+ dq 0x000000000000000E
+ dq 0x000000000000000F
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_mb_x16_avx512
+no_sha1_mb_x16_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm
new file mode 100644
index 000000000..eb67309da
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm
@@ -0,0 +1,416 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute quad SHA1 using AVX
+;; derived from ...\sha1_multiple\sha1_quad4.asm
+;; variation of sha1_mult2.asm : clobbers all xmm regs, rcx left intact
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+;;
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = ((B & C) | ((~ B) & D) )
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpand %%regF, %%regB,%%regC
+ vpandn %%regT, %%regB,%%regD
+ vpor %%regF, %%regT,%%regF
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF,%%regD,%%regC
+ vpxor %%regF,%%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpor %%regF,%%regB,%%regC
+ vpand %%regT,%%regB,%%regC
+ vpand %%regF,%%regF,%%regD
+ vpor %%regF,%%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-(%%imm))
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PROLD_nd reg, imm, tmp, src
+%macro PROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsrld %%tmp, %%src, (32-(%%imm))
+ vpslld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT
+ vpaddd %%regE, %%regE,[rsp + (%%memW * 16)]
+ PROLD_nd %%regT,5, %%regF,%%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE, %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [rsp + ((%%memW - 14) & 15) * 16]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [rsp + ((%%memW - 8) & 15) * 16]
+ vpxor W16, W16, [rsp + ((%%memW - 3) & 15) * 16]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; FRAMESZ plus pushes must be an odd multiple of 8
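+;; (the call leaves rsp at 8 mod 16, so reserving an odd multiple of 8 restores
+;; 16-byte alignment for the aligned vmovdqa stores to [rsp + ...])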
+%define XMM_SAVE ((15-15)*16 + 1*8)
+%define FRAMESZ 16*16 + XMM_SAVE
+%define _XMM FRAMESZ - XMM_SAVE
+
+%define VMOVPS vmovups
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%define IDX rax
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4
+%define F xmm5 ; tmp
+%define G xmm6 ; tmp
+
+%define TMP G
+%define FUN F
+%define K xmm7
+
+%define AA xmm8
+%define BB xmm9
+%define CC xmm10
+%define DD xmm11
+%define EE xmm12
+
+%define T0 xmm6
+%define T1 xmm7
+%define T2 xmm8
+%define T3 xmm9
+%define T4 xmm10
+%define T5 xmm11
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define W14 xmm13
+%define W15 xmm14
+%define W16 xmm15
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+%define DIGEST_SIZE (4*5*4)
+
+;%ifdef LINUX
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define ARG1 rdi
+ %define ARG2 rsi
+%else
+ ; Windows
+ %define ARG1 rcx
+ %define ARG2 rdx
+%endif
+
+align 32
+
+; void sha1_mb_x4_avx(SHA1_MB_ARGS_X8 *args, uint32_t size_in_blocks);
+; arg 1 : ARG1 : pointer to args (only 4 of the 8 lanes used)
+; arg 2 : ARG2 : size (in blocks) ;; assumed to be >= 1
+;
+; Clobbers registers: ARG2, rax, r8-r11, xmm0-xmm15
+;
+mk_global sha1_mb_x4_avx, function, internal
+sha1_mb_x4_avx:
+ endbranch
+
+ sub rsp, FRAMESZ ;; FRAMESZ + pushes must be odd multiple of 8
+
+ ;; Initialize digests
+ vmovdqa A, [ARG1 + 0*16]
+ vmovdqa B, [ARG1 + 1*16]
+ vmovdqa C, [ARG1 + 2*16]
+ vmovdqa D, [ARG1 + 3*16]
+ vmovdqa E, [ARG1 + 4*16]
+
+ ;; load input pointers
+ mov inp0,[ARG1 + _data_ptr + 0*8]
+ mov inp1,[ARG1 + _data_ptr + 1*8]
+ mov inp2,[ARG1 + _data_ptr + 2*8]
+ mov inp3,[ARG1 + _data_ptr + 3*8]
+
+ xor IDX, IDX
+lloop:
+ vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+%assign I 0
+%rep 4
+ VMOVPS T2,[inp0+IDX]
+ VMOVPS T1,[inp1+IDX]
+ VMOVPS T4,[inp2+IDX]
+ VMOVPS T3,[inp3+IDX]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vpshufb T0, T0, F
+ vmovdqa [rsp+(I*4+0)*16],T0
+ vpshufb T1, T1, F
+ vmovdqa [rsp+(I*4+1)*16],T1
+ vpshufb T2, T2, F
+ vmovdqa [rsp+(I*4+2)*16],T2
+ vpshufb T3, T3, F
+ vmovdqa [rsp+(I*4+3)*16],T3
+ add IDX, 4*4
+%assign I (I+1)
+%endrep
+
+ ; save old digests
+ vmovdqa AA, A
+ vmovdqa BB, B
+ vmovdqa CC, C
+ vmovdqa DD, D
+ vmovdqa EE, E
+
+;;
+;; perform 0-79 steps
+;;
+ vmovdqa K, [K00_19]
+;; do rounds 0...15
+%assign I 0
+%rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19
+ vmovdqa W16, [rsp + ((16 - 16) & 15) * 16]
+ vmovdqa W15, [rsp + ((16 - 15) & 15) * 16]
+%rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 20...39
+ vmovdqa K, [K20_39]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 40...59
+ vmovdqa K, [K40_59]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 60...79
+ vmovdqa K, [K60_79]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+
+ sub ARG2, 1
+ jne lloop
+
+ ; write out digests
+ vmovdqa [ARG1 + 0*16], A
+ vmovdqa [ARG1 + 1*16], B
+ vmovdqa [ARG1 + 2*16], C
+ vmovdqa [ARG1 + 3*16], D
+ vmovdqa [ARG1 + 4*16], E
+
+ ; update input pointers
+ add inp0, IDX
+ mov [ARG1 + _data_ptr + 0*8], inp0
+ add inp1, IDX
+ mov [ARG1 + _data_ptr + 1*8], inp1
+ add inp2, IDX
+ mov [ARG1 + _data_ptr + 2*8], inp2
+ add inp3, IDX
+ mov [ARG1 + _data_ptr + 3*8], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, FRAMESZ
+
+ ret
+
+
+section .data align=16
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm
new file mode 100644
index 000000000..5677dce73
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm
@@ -0,0 +1,413 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute quad SHA1 using SSE
+;; derived from ...\sha1_multiple\sha1_quad4.asm
+;; variation of sha1_mult2.asm
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ movaps %%t0, %%r0 ; t0 = {a3 a2 a1 a0}
+ shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ movaps %%t1, %%r2 ; t1 = {c3 c2 c1 c0}
+ shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ movaps %%r1, %%t0 ; r1 = {b1 b0 a1 a0}
+ shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ movaps %%r3, %%r0 ; r3 = {b3 b2 a3 a2}
+ shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
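In other words, TRANSPOSE is an ordinary 4x4 transpose of 32-bit words, so that afterwards
each register holds the same message word from all four lanes. A scalar sketch of the
effect (illustration only; it ignores the t0/r1/r0/r3 output-register naming):

    #include <stdint.h>

    /* in[lane][w] is word w of that lane's 16-byte chunk;
     * out[w][lane] collects word w from every lane. */
    static void transpose_4x4(const uint32_t in[4][4], uint32_t out[4][4])
    {
            for (int w = 0; w < 4; w++)
                    for (int lane = 0; lane < 4; lane++)
                            out[w][lane] = in[lane][w];
    }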
+;;
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regC
+ pxor %%regF,%%regD
+ pand %%regF,%%regB
+ pxor %%regF,%%regD
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regD
+ pxor %%regF,%%regC
+ pxor %%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regB
+ movdqa %%regT,%%regB
+ por %%regF,%%regC
+ pand %%regT,%%regC
+ pand %%regF,%%regD
+ por %%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
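The boolean identities these macros rely on are easier to verify in scalar form. A C
sketch of the three FIPS 180-1 functions exactly as the macros compute them (MAGIC_F0 is
the branch-free rewrite of Ch, MAGIC_F2 an equivalent form of Maj; sketch only, not part
of the patch):

    #include <stdint.h>

    static inline uint32_t f0_ch(uint32_t b, uint32_t c, uint32_t d)
    {
            return d ^ (b & (c ^ d));            /* == (b & c) | (~b & d) */
    }

    static inline uint32_t f1_parity(uint32_t b, uint32_t c, uint32_t d)
    {
            return b ^ c ^ d;                    /* MAGIC_F3 reuses this for rounds 60..79 */
    }

    static inline uint32_t f2_maj(uint32_t b, uint32_t c, uint32_t d)
    {
            return ((b | c) & d) | (b & c);      /* == (b & c) | (b & d) | (c & d) */
    }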
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ pslld %%reg, %%imm
+ psrld %%tmp, (32-%%imm)
+ por %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ paddd %%regE,%%immCNT
+ paddd %%regE,[rsp + (%%memW * 16)]
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ paddd %%regE,%%immCNT
+ movdqa W14, [rsp + ((%%memW - 14) & 15) * 16]
+ pxor W16, W14
+ pxor W16, [rsp + ((%%memW - 8) & 15) * 16]
+ pxor W16, [rsp + ((%%memW - 3) & 15) * 16]
+ movdqa %%regF, W16
+ pslld W16, 1
+ psrld %%regF, (32-1)
+ por %%regF, W16
+ ROTATE_W
+
+ movdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF
+ paddd %%regE,%%regF
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; FRAMESZ plus pushes must be an odd multiple of 8
+%define XMM_SAVE ((15-15)*16 + 1*8)
+%define FRAMESZ 16*16 + XMM_SAVE
+%define _XMM FRAMESZ - XMM_SAVE
+
+%define MOVPS movups
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%define IDX rax
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4
+%define F xmm5 ; tmp
+%define G xmm6 ; tmp
+
+%define TMP G
+%define FUN F
+%define K xmm7
+
+%define AA xmm8
+%define BB xmm9
+%define CC xmm10
+%define DD xmm11
+%define EE xmm12
+
+%define T0 xmm6
+%define T1 xmm7
+%define T2 xmm8
+%define T3 xmm9
+%define T4 xmm10
+%define T5 xmm11
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define W14 xmm13
+%define W15 xmm14
+%define W16 xmm15
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+%define DIGEST_SIZE (4*5*4)
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define ARG1 rdi
+ %define ARG2 rsi
+%else
+ ; Windows
+ %define ARG1 rcx
+ %define ARG2 rdx
+%endif
+
+align 32
+
+; void sha1_mb_x4_sse(SHA1_MB_ARGS_X8 *args, uint32_t size_in_blocks);
+; arg 1 : ARG1 : pointer to args (only 4 of the 8 lanes used)
+; arg 2 : ARG2 : size (in blocks) ;; assumed to be >= 1
+;
+; Clobbers registers: ARG2, rax, r8-r11, xmm0-xmm15
+;
+mk_global sha1_mb_x4_sse, function, internal
+sha1_mb_x4_sse:
+ endbranch
+
+ sub rsp, FRAMESZ ;; FRAMESZ + pushes must be odd multiple of 8
+
+ ;; Initialize digests
+ movdqa A, [ARG1 + 0*16]
+ movdqa B, [ARG1 + 1*16]
+ movdqa C, [ARG1 + 2*16]
+ movdqa D, [ARG1 + 3*16]
+ movdqa E, [ARG1 + 4*16]
+
+ ;; load input pointers
+ mov inp0,[ARG1 + _data_ptr + 0*8]
+ mov inp1,[ARG1 + _data_ptr + 1*8]
+ mov inp2,[ARG1 + _data_ptr + 2*8]
+ mov inp3,[ARG1 + _data_ptr + 3*8]
+
+ xor IDX, IDX
+lloop:
+ movdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+%assign I 0
+%rep 4
+ MOVPS T2,[inp0+IDX]
+ MOVPS T1,[inp1+IDX]
+ MOVPS T4,[inp2+IDX]
+ MOVPS T3,[inp3+IDX]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ pshufb T0, F
+ movdqa [rsp+(I*4+0)*16],T0
+ pshufb T1, F
+ movdqa [rsp+(I*4+1)*16],T1
+ pshufb T2, F
+ movdqa [rsp+(I*4+2)*16],T2
+ pshufb T3, F
+ movdqa [rsp+(I*4+3)*16],T3
+ add IDX, 4*4
+%assign I (I+1)
+%endrep
+
+ ; save old digests
+ movdqa AA, A
+ movdqa BB, B
+ movdqa CC, C
+ movdqa DD, D
+ movdqa EE, E
+
+;;
+;; perform 0-79 steps
+;;
+ movdqa K, [K00_19]
+;; do rounds 0...15
+%assign I 0
+%rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19
+ movdqa W16, [rsp + ((16 - 16) & 15) * 16]
+ movdqa W15, [rsp + ((16 - 15) & 15) * 16]
+%rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 20...39
+ movdqa K, [K20_39]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 40...59
+ movdqa K, [K40_59]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 60...79
+ movdqa K, [K60_79]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+ paddd A,AA
+ paddd B,BB
+ paddd C,CC
+ paddd D,DD
+ paddd E,EE
+
+ sub ARG2, 1
+ jne lloop
+
+ ; write out digests
+ movdqa [ARG1 + 0*16], A
+ movdqa [ARG1 + 1*16], B
+ movdqa [ARG1 + 2*16], C
+ movdqa [ARG1 + 3*16], D
+ movdqa [ARG1 + 4*16], E
+
+ ; update input pointers
+ add inp0, IDX
+ mov [ARG1 + _data_ptr + 0*8], inp0
+ add inp1, IDX
+ mov [ARG1 + _data_ptr + 1*8], inp1
+ add inp2, IDX
+ mov [ARG1 + _data_ptr + 2*8], inp2
+ add inp3, IDX
+ mov [ARG1 + _data_ptr + 3*8], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, FRAMESZ
+
+ ret
+
+
+section .data align=16
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm
new file mode 100644
index 000000000..edcba6d3f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm
@@ -0,0 +1,518 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute oct SHA1 using AVX2
+;; outer calling routine takes care of save and restore of XMM registers
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
+;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp r8
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp r8
+;;
+;; clobbers ymm0-15
+
+
+; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+; "transpose" data in {r0...r7} using temps {t0...t1}
+; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
+;
+; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+;
+%macro TRANSPOSE8 10
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ ; process bottom half (r4..r7) {e...h}
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
+
+ vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6
+ vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2
+ vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5
+ vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1
+ vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7
+ vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3
+ vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4
+ vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0
+%endmacro
+
+;;
+;; Magic functions defined in FIPS 180-1
+;;
+;MAGIC_F0 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; F0 = ((B & C) | ((~B) & D))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpand %%regF, %%regB,%%regC
+ vpandn %%regT, %%regB,%%regD
+ vpor %%regF, %%regT,%%regF
+%endmacro
+
+;MAGIC_F1 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF,%%regD,%%regC
+ vpxor %%regF,%%regF,%%regB
+%endmacro
+
+
+
+;MAGIC_F2 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpor %%regF,%%regB,%%regC
+ vpand %%regT,%%regB,%%regC
+ vpand %%regF,%%regF,%%regD
+ vpor %%regF,%%regF,%%regT
+%endmacro
+
+;MAGIC_F3 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-%%imm)
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PROLD_nd reg, imm, tmp, src
+%macro PROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsrld %%tmp, %%src, (32-%%imm)
+ vpslld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT
+ vpaddd %%regE, %%regE,[rsp + (%%memW * 32)]
+ PROLD_nd %%regT,5, %%regF,%%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE, %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqu W14, [rsp + ((%%memW - 14) & 15) * 32]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [rsp + ((%%memW - 8) & 15) * 32]
+ vpxor W16, W16, [rsp + ((%%memW - 3) & 15) * 32]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqu [rsp + ((%%memW - 0) & 15) * 32],%%regF
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; FRAMESZ plus pushes must be an odd multiple of 8
+%define YMM_SAVE (15-15)*32
+%define FRAMESZ 32*16 + 0*8 + YMM_SAVE
+%define _YMM FRAMESZ - YMM_SAVE
+
+%define VMOVPS vmovups
+
+%define IDX rax
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define inp7 rcx
+ %define arg1 rdi
+ %define arg2 rsi
+ %define RSP_SAVE rdx
+%else
+ %define inp7 rdi
+ %define arg1 rcx
+ %define arg2 rdx
+ %define RSP_SAVE rsi
+%endif
+
+
+; ymm0 A
+; ymm1 B
+; ymm2 C
+; ymm3 D
+; ymm4 E
+; ymm5 F AA
+; ymm6 T0 BB
+; ymm7 T1 CC
+; ymm8 T2 DD
+; ymm9 T3 EE
+; ymm10 T4 TMP
+; ymm11 T5 FUN
+; ymm12 T6 K
+; ymm13 T7 W14
+; ymm14 T8 W15
+; ymm15 T9 W16
+
+%define A ymm0
+%define B ymm1
+%define C ymm2
+%define D ymm3
+%define E ymm4
+
+%define F ymm5
+%define T0 ymm6
+%define T1 ymm7
+%define T2 ymm8
+%define T3 ymm9
+%define T4 ymm10
+%define T5 ymm11
+%define T6 ymm12
+%define T7 ymm13
+%define T8 ymm14
+%define T9 ymm15
+
+%define AA ymm5
+%define BB ymm6
+%define CC ymm7
+%define DD ymm8
+%define EE ymm9
+%define TMP ymm10
+%define FUN ymm11
+%define K ymm12
+%define W14 ymm13
+%define W15 ymm14
+%define W16 ymm15
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+%define DIGEST_SIZE (8*5*4) ; 8 streams x 5 32bit words per digest x 4 bytes per word
+
+align 32
+
+; void sha1_mb_x8_avx2(SHA1_MB_ARGS_X8 *args, uint32_t size_in_blocks);
+; arg 1 : pointer to args
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+mk_global sha1_mb_x8_avx2, function, internal
+sha1_mb_x8_avx2:
+ endbranch
+
+ push RSP_SAVE
+
+ ; save rsp
+ mov RSP_SAVE, rsp
+ sub rsp, FRAMESZ ;; FRAMESZ + pushes must be even multiple of 8
+
+ ; align rsp to 32 Bytes
+ and rsp, ~0x1F
+
+ ;; Initialize digests
+ vmovdqu A, [arg1 + 0*32]
+ vmovdqu B, [arg1 + 1*32]
+ vmovdqu C, [arg1 + 2*32]
+ vmovdqu D, [arg1 + 3*32]
+ vmovdqu E, [arg1 + 4*32]
+
+ ;; transpose input onto stack
+ mov inp0,[arg1+_data_ptr+0*8]
+ mov inp1,[arg1+_data_ptr+1*8]
+ mov inp2,[arg1+_data_ptr+2*8]
+ mov inp3,[arg1+_data_ptr+3*8]
+ mov inp4,[arg1+_data_ptr+4*8]
+ mov inp5,[arg1+_data_ptr+5*8]
+ mov inp6,[arg1+_data_ptr+6*8]
+ mov inp7,[arg1+_data_ptr+7*8]
+
+ xor IDX, IDX
+lloop:
+ vmovdqu F, [PSHUFFLE_BYTE_FLIP_MASK]
+%assign I 0
+%rep 2
+ VMOVPS T0,[inp0+IDX]
+ VMOVPS T1,[inp1+IDX]
+ VMOVPS T2,[inp2+IDX]
+ VMOVPS T3,[inp3+IDX]
+ VMOVPS T4,[inp4+IDX]
+ VMOVPS T5,[inp5+IDX]
+ VMOVPS T6,[inp6+IDX]
+ VMOVPS T7,[inp7+IDX]
+ TRANSPOSE8 T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
+ vpshufb T0, T0, F
+ vmovdqu [rsp+(I*8+0)*32],T0
+ vpshufb T1, T1, F
+ vmovdqu [rsp+(I*8+1)*32],T1
+ vpshufb T2, T2, F
+ vmovdqu [rsp+(I*8+2)*32],T2
+ vpshufb T3, T3, F
+ vmovdqu [rsp+(I*8+3)*32],T3
+ vpshufb T4, T4, F
+ vmovdqu [rsp+(I*8+4)*32],T4
+ vpshufb T5, T5, F
+ vmovdqu [rsp+(I*8+5)*32],T5
+ vpshufb T6, T6, F
+ vmovdqu [rsp+(I*8+6)*32],T6
+ vpshufb T7, T7, F
+ vmovdqu [rsp+(I*8+7)*32],T7
+ add IDX, 32
+%assign I (I+1)
+%endrep
+
+
+ ; save old digests
+ vmovdqu AA, A
+ vmovdqu BB, B
+ vmovdqu CC, C
+ vmovdqu DD, D
+ vmovdqu EE, E
+
+;;
+;; perform 0-79 steps
+;;
+ vmovdqu K, [K00_19]
+;; do rounds 0...15
+%assign I 0
+%rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19
+ vmovdqu W16, [rsp + ((16 - 16) & 15) * 32]
+ vmovdqu W15, [rsp + ((16 - 15) & 15) * 32]
+%rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 20...39
+ vmovdqu K, [K20_39]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 40...59
+ vmovdqu K, [K40_59]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 60...79
+ vmovdqu K, [K60_79]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+
+ sub arg2, 1
+ jne lloop
+
+ ; write out digests
+ vmovdqu [arg1 + 0*32], A
+ vmovdqu [arg1 + 1*32], B
+ vmovdqu [arg1 + 2*32], C
+ vmovdqu [arg1 + 3*32], D
+ vmovdqu [arg1 + 4*32], E
+
+ ;; update input pointers
+ add inp0, IDX
+ add inp1, IDX
+ add inp2, IDX
+ add inp3, IDX
+ add inp4, IDX
+ add inp5, IDX
+ add inp6, IDX
+ add inp7, IDX
+ mov [arg1+_data_ptr+0*8], inp0
+ mov [arg1+_data_ptr+1*8], inp1
+ mov [arg1+_data_ptr+2*8], inp2
+ mov [arg1+_data_ptr+3*8], inp3
+ mov [arg1+_data_ptr+4*8], inp4
+ mov [arg1+_data_ptr+5*8], inp5
+ mov [arg1+_data_ptr+6*8], inp6
+ mov [arg1+_data_ptr+7*8], inp7
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ mov rsp, RSP_SAVE
+
+ pop RSP_SAVE
+ ret
+
+
+
+section .data align=32
+
+align 32
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multi_buffer_example.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multi_buffer_example.c
new file mode 100644
index 000000000..e778c5d98
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multi_buffer_example.c
@@ -0,0 +1,112 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sha1_mb.h"
+#include "test.h"
+
+// Test messages
+#define TST_STR "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+uint8_t msg1[] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq";
+uint8_t msg2[] = "0123456789:;<=>?@ABCDEFGHIJKLMNO";
+uint8_t msg3[] = TST_STR TST_STR "0123456789:;<";
+uint8_t msg4[] = TST_STR TST_STR TST_STR "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR";
+uint8_t msg5[] = TST_STR TST_STR TST_STR TST_STR TST_STR "0123456789:;<=>?";
+uint8_t msg6[] =
+ TST_STR TST_STR TST_STR TST_STR TST_STR TST_STR "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU";
+uint8_t msg7[] = "";
+
+// Expected digests
+uint32_t dgst1[] = { 0x84983E44, 0x1C3BD26E, 0xBAAE4AA1, 0xF95129E5, 0xE54670F1 };
+uint32_t dgst2[] = { 0xB7C66452, 0x0FD122B3, 0x55D539F2, 0xA35E6FAA, 0xC2A5A11D };
+uint32_t dgst3[] = { 0x127729B6, 0xA8B2F8A0, 0xA4DDC819, 0x08E1D8B3, 0x67CEEA55 };
+uint32_t dgst4[] = { 0xFDDE2D00, 0xABD5B7A3, 0x699DE6F2, 0x3FF1D1AC, 0x3B872AC2 };
+uint32_t dgst5[] = { 0xE7FCA85C, 0xA4AB3740, 0x6A180B32, 0x0B8D362C, 0x622A96E6 };
+uint32_t dgst6[] = { 0x505B0686, 0xE1ACDF42, 0xB3588B5A, 0xB043D52C, 0x6D8C7444 };
+uint32_t dgst7[] = { 0xDA39A3EE, 0x5E6B4B0D, 0x3255BFEF, 0x95601890, 0xAFD80709 };
+
+uint8_t *msgs[] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7 };
+uint32_t *expected_digest[] = { dgst1, dgst2, dgst3, dgst4, dgst5, dgst6, dgst7 };
+
+int check_job(uint32_t * ref, uint32_t * good, int words)
+{
+ int i;
+ for (i = 0; i < words; i++)
+ if (good[i] != ref[i])
+ return 1;
+
+ return 0;
+}
+
+#define MAX_MSGS 7
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[MAX_MSGS];
+ SHA1_HASH_CTX *p_job;
+ int i, checked = 0, failed = 0;
+ int n = sizeof(msgs) / sizeof(msgs[0]);
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed, test aborted\n");
+ return 1;
+ }
+ // Initialize multi-buffer manager
+ sha1_ctx_mgr_init(mgr);
+
+ for (i = 0; i < n; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)expected_digest[i];
+
+ p_job = sha1_ctx_mgr_submit(mgr, &ctxpool[i], msgs[i],
+ strlen((char *)msgs[i]), HASH_ENTIRE);
+
+ if (p_job) { // If we have finished a job, process it
+ checked++;
+ failed +=
+ check_job(p_job->job.result_digest, p_job->user_data,
+ SHA1_DIGEST_NWORDS);
+ }
+ }
+
+ // Finish remaining jobs
+ while (NULL != (p_job = sha1_ctx_mgr_flush(mgr))) {
+ checked++;
+ failed +=
+ check_job(p_job->job.result_digest, p_job->user_data, SHA1_DIGEST_NWORDS);
+ }
+
+ printf("Example multi-buffer sha1 completed=%d, failed=%d\n", checked, failed);
+ return failed;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multibinary.asm
new file mode 100644
index 000000000..c205f2389
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multibinary.asm
@@ -0,0 +1,131 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+default rel
+[bits 64]
+
+; declare the L3 ctx level symbols (these will then call the appropriate
+; L2 symbols)
+extern sha1_ctx_mgr_init_sse
+extern sha1_ctx_mgr_submit_sse
+extern sha1_ctx_mgr_flush_sse
+
+extern sha1_ctx_mgr_init_avx
+extern sha1_ctx_mgr_submit_avx
+extern sha1_ctx_mgr_flush_avx
+
+extern sha1_ctx_mgr_init_avx2
+extern sha1_ctx_mgr_submit_avx2
+extern sha1_ctx_mgr_flush_avx2
+
+extern sha1_ctx_mgr_init_base
+extern sha1_ctx_mgr_submit_base
+extern sha1_ctx_mgr_flush_base
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ extern sha1_ctx_mgr_init_avx512
+ extern sha1_ctx_mgr_submit_avx512
+ extern sha1_ctx_mgr_flush_avx512
+%endif
+
+%ifdef HAVE_AS_KNOWS_SHANI
+ extern sha1_ctx_mgr_init_sse_ni
+ extern sha1_ctx_mgr_submit_sse_ni
+ extern sha1_ctx_mgr_flush_sse_ni
+%endif
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ %ifdef HAVE_AS_KNOWS_SHANI
+ extern sha1_ctx_mgr_init_avx512_ni
+ extern sha1_ctx_mgr_submit_avx512_ni
+ extern sha1_ctx_mgr_flush_avx512_ni
+ %endif
+%endif
+
+;;; *_mbinit are the initial values for *_dispatched, which is updated on the first call.
+;;; Therefore, *_dispatch_init is only executed on the first call.
+
+; Initialise symbols
+mbin_interface sha1_ctx_mgr_init
+mbin_interface sha1_ctx_mgr_submit
+mbin_interface sha1_ctx_mgr_flush
+
+%ifdef HAVE_AS_KNOWS_AVX512
+	; Reuse the mbin_dispatch_init6 extension, replacing the base version with the SSE one
+ %ifdef HAVE_AS_KNOWS_SHANI
+ mbin_dispatch_base_to_avx512_shani sha1_ctx_mgr_init, sha1_ctx_mgr_init_base, \
+ sha1_ctx_mgr_init_sse, sha1_ctx_mgr_init_avx, sha1_ctx_mgr_init_avx2, \
+ sha1_ctx_mgr_init_avx512, sha1_ctx_mgr_init_sse_ni, sha1_ctx_mgr_init_avx512_ni
+ mbin_dispatch_base_to_avx512_shani sha1_ctx_mgr_submit, sha1_ctx_mgr_submit_base, \
+ sha1_ctx_mgr_submit_sse, sha1_ctx_mgr_submit_avx, sha1_ctx_mgr_submit_avx2, \
+ sha1_ctx_mgr_submit_avx512, sha1_ctx_mgr_submit_sse_ni, sha1_ctx_mgr_submit_avx512_ni
+ mbin_dispatch_base_to_avx512_shani sha1_ctx_mgr_flush, sha1_ctx_mgr_flush_base, \
+ sha1_ctx_mgr_flush_sse, sha1_ctx_mgr_flush_avx, sha1_ctx_mgr_flush_avx2, \
+ sha1_ctx_mgr_flush_avx512, sha1_ctx_mgr_flush_sse_ni, sha1_ctx_mgr_flush_avx512_ni
+ %else
+ mbin_dispatch_init6 sha1_ctx_mgr_init, sha1_ctx_mgr_init_base, \
+ sha1_ctx_mgr_init_sse, sha1_ctx_mgr_init_avx, sha1_ctx_mgr_init_avx2, \
+ sha1_ctx_mgr_init_avx512
+ mbin_dispatch_init6 sha1_ctx_mgr_submit, sha1_ctx_mgr_submit_base, \
+ sha1_ctx_mgr_submit_sse, sha1_ctx_mgr_submit_avx, sha1_ctx_mgr_submit_avx2, \
+ sha1_ctx_mgr_submit_avx512
+ mbin_dispatch_init6 sha1_ctx_mgr_flush, sha1_ctx_mgr_flush_base, \
+ sha1_ctx_mgr_flush_sse, sha1_ctx_mgr_flush_avx, sha1_ctx_mgr_flush_avx2, \
+ sha1_ctx_mgr_flush_avx512
+ %endif
+%else
+ %ifdef HAVE_AS_KNOWS_SHANI
+ mbin_dispatch_sse_to_avx2_shani sha1_ctx_mgr_init, sha1_ctx_mgr_init_sse, \
+ sha1_ctx_mgr_init_avx, sha1_ctx_mgr_init_avx2, sha1_ctx_mgr_init_sse_ni
+ mbin_dispatch_sse_to_avx2_shani sha1_ctx_mgr_submit, sha1_ctx_mgr_submit_sse, \
+ sha1_ctx_mgr_submit_avx, sha1_ctx_mgr_submit_avx2, sha1_ctx_mgr_submit_sse_ni
+ mbin_dispatch_sse_to_avx2_shani sha1_ctx_mgr_flush, sha1_ctx_mgr_flush_sse, \
+ sha1_ctx_mgr_flush_avx, sha1_ctx_mgr_flush_avx2, sha1_ctx_mgr_flush_sse_ni
+ %else
+ mbin_dispatch_init sha1_ctx_mgr_init, sha1_ctx_mgr_init_sse, \
+ sha1_ctx_mgr_init_avx, sha1_ctx_mgr_init_avx2
+ mbin_dispatch_init sha1_ctx_mgr_submit, sha1_ctx_mgr_submit_sse, \
+ sha1_ctx_mgr_submit_avx, sha1_ctx_mgr_submit_avx2
+ mbin_dispatch_init sha1_ctx_mgr_flush, sha1_ctx_mgr_flush_sse, \
+ sha1_ctx_mgr_flush_avx, sha1_ctx_mgr_flush_avx2
+ %endif
+%endif
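The mechanism the mbin_* macros generate is the usual lazy function-pointer dispatch: the
public symbol initially points at a one-shot init routine that probes the CPU, rewrites
the pointer to the best available variant and then forwards the call. A self-contained C
sketch of the idea (all names below are invented for illustration; the real feature
detection lives in multibinary.asm):

    #include <stdio.h>

    typedef void (*init_fn)(void);

    static void init_base(void) { puts("base code path"); }
    static void init_avx2(void) { puts("avx2 code path"); }

    static int cpu_has_avx2(void) { return 0; }      /* stand-in for the CPUID probing */

    static void dispatch_init(void);
    static init_fn init_dispatched = dispatch_init;  /* the "*_mbinit" starting value */

    static void dispatch_init(void)
    {
            /* runs once: pick the best variant, overwrite the pointer, forward the call */
            init_dispatched = cpu_has_avx2() ? init_avx2 : init_base;
            init_dispatched();
    }

    void public_init(void) { init_dispatched(); }

    int main(void)
    {
            public_init();   /* first call goes through dispatch_init */
            public_init();   /* later calls jump straight to the chosen variant */
            return 0;
    }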
+
+;;; func core, ver, snum
+slversion sha1_ctx_mgr_init, 00, 04, 0148
+slversion sha1_ctx_mgr_submit, 00, 04, 0149
+slversion sha1_ctx_mgr_flush, 00, 04, 0150
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm
new file mode 100644
index 000000000..86d09e303
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm
@@ -0,0 +1,318 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+%endif
+
+;; FRAMESZ plus pushes must be an odd multiple of 8
+%define FRAMESZ 32 ; space for ABCDE
+%define RSPSAVE rax
+
+%define ABCD xmm0
+; two E's because they ping-pong
+%define E0 xmm1
+%define E1 xmm2
+%define MSG0 xmm3
+%define MSG1 xmm4
+%define MSG2 xmm5
+%define MSG3 xmm6
+%define SHUF_MASK xmm7
+
+; arg index starts from 0, while mgr_flush/submit counts from 1
+%define MGR arg0
+%define NBLK arg1
+%define NLANX4 r10 ; consistent with caller
+%define IDX r8 ; local variable -- consistent with caller
+%define DPTR r11 ; local variable -- input buffer pointer
+%define TMP r9 ; local variable -- assistant to address digest
+;%define TMP2 r8 ; local variable -- assistant to address digest
+align 32
+
+; void sha1_ni_x1(SHA1_MB_ARGS_Xn *args, uint32_t size_in_blocks);
+; arg 0 : MGR : pointer to args (only 1 of the 16 lanes used)
+; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
+; invisible arg 2 : IDX : hash on which lane
+; invisible arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it)
+; (sse/avx is 4, avx2 is 8, avx512 is 16)
+;
+; Clobbers registers: rax, r9~r11, xmm0-xmm7
+;
+mk_global sha1_ni_x1, function, internal
+sha1_ni_x1:
+ endbranch
+ mov RSPSAVE, rsp
+ sub rsp, FRAMESZ
+ and rsp, ~0xF ; Align 16Bytes downward
+
+ shl NBLK, 6 ; transform blk amount into bytes
+ jz backto_mgr
+
+ ; detach idx from nlanx4
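+	; (r10 arrives packed as (max_lanes*4 << 8) | lane_index, hence the shr/and below)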
+ mov IDX, NLANX4
+ shr NLANX4, 8
+ and IDX, 0xff
+
+ lea TMP, [MGR + 4*IDX]
+ ;; Initialize digest
+ pinsrd ABCD, [TMP + 0*NLANX4], 3
+ pinsrd ABCD, [TMP + 1*NLANX4], 2
+ pinsrd ABCD, [TMP + 2*NLANX4], 1
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pinsrd ABCD, [TMP + 1*NLANX4], 0
+ pinsrd E0, [TMP + 2*NLANX4], 3
+ pand E0, [IDX3_WORD_MASK]
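+	; IDX3_WORD_MASK keeps only dword 3 (the E value) and zeroes the other dwords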
+
+ movdqa SHUF_MASK, [PSHUFFLE_SHANI_MASK]
+
+ ;; Load input pointers
+ mov DPTR, [MGR + _data_ptr + IDX*8]
+ ;; nblk is used to indicate data end
+ add NBLK, DPTR
+
+lloop:
+ ; Save hash values for addition after rounds
+ movdqa [rsp + 0*16], E0
+ movdqa [rsp + 1*16], ABCD
+
+ ; do rounds 0-3
+ movdqu MSG0, [DPTR + 0*16]
+ pshufb MSG0, SHUF_MASK
+ paddd E0, MSG0
+ movdqa E1, ABCD
+ sha1rnds4 ABCD, E0, 0
+
+ ; do rounds 4-7
+ movdqu MSG1, [DPTR + 1*16]
+ pshufb MSG1, SHUF_MASK
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1rnds4 ABCD, E1, 0
+ sha1msg1 MSG0, MSG1
+
+ ; do rounds 8-11
+ movdqu MSG2, [DPTR + 2*16]
+ pshufb MSG2, SHUF_MASK
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1rnds4 ABCD, E0, 0
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ ; do rounds 12-15
+ movdqu MSG3, [DPTR + 3*16]
+ pshufb MSG3, SHUF_MASK
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 0
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ ; do rounds 16-19
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 0
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ ; do rounds 20-23
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+
+ ; do rounds 24-27
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 1
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ ; do rounds 28-31
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ ; do rounds 32-35
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 1
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ ; do rounds 36-39
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+
+ ; do rounds 40-43
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ ; do rounds 44-47
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 2
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ ; do rounds 48-51
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ ; do rounds 52-55
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 2
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+
+ ; do rounds 56-59
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ ; do rounds 60-63
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 3
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ ; do rounds 64-67
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 3
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ ; do rounds 68-71
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 3
+ pxor MSG3, MSG1
+
+ ; do rounds 72-75
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 3
+
+ ; do rounds 76-79
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1rnds4 ABCD, E1, 3
+
+ ; Add current hash values with previously saved
+ sha1nexte E0, [rsp + 0*16]
+ paddd ABCD, [rsp + 1*16]
+
+ ; Increment data pointer and loop if more to process
+ add DPTR, 64
+ cmp DPTR, NBLK
+ jne lloop
+
+ ; write out digests
+ lea TMP, [MGR + 4*IDX]
+ pextrd [TMP + 0*NLANX4], ABCD, 3
+ pextrd [TMP + 1*NLANX4], ABCD, 2
+ pextrd [TMP + 2*NLANX4], ABCD, 1
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pextrd [TMP + 1*NLANX4], ABCD, 0
+ pextrd [TMP + 2*NLANX4], E0, 3
+
+ ; update input pointers
+ mov [MGR + _data_ptr + IDX*8], DPTR
+
+backto_mgr:
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ mov rsp, RSPSAVE
+
+ ret
+
+
+section .data align=16
+PSHUFFLE_SHANI_MASK: dq 0x08090a0b0c0d0e0f, 0x0001020304050607
+IDX3_WORD_MASK: dq 0x0000000000000000, 0xFFFFFFFF00000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_ni_x1
+no_sha1_ni_x1:
+%endif
+%endif ; HAVE_AS_KNOWS_SHANI
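The four-round cadence used throughout sha1_ni_x1 (sha1nexte, sha1rnds4, sha1msg1/sha1msg2)
maps directly onto the SHA-NI compiler intrinsics. A sketch of the first eight rounds,
assuming a toolchain that exposes these intrinsics in immintrin.h and is built with the SHA
extension enabled (e.g. -msha); the function and variable names are invented for
illustration:

    #include <immintrin.h>

    /* abcd holds A..D, e0 holds E in its high dword (as set up by pinsrd/pand above);
     * block[] holds the four 16-byte message words, already byte-swapped. */
    static void sha1ni_rounds_0_to_7(__m128i *abcd, __m128i *e0, const __m128i block[4])
    {
            __m128i e1, msg0, msg1;

            /* rounds 0-3: add W0..3 into E, then four rounds with f0/K0 (imm = 0) */
            msg0  = block[0];
            *e0   = _mm_add_epi32(*e0, msg0);
            e1    = *abcd;
            *abcd = _mm_sha1rnds4_epu32(*abcd, *e0, 0);

            /* rounds 4-7: sha1nexte derives the next E term from the old ABCD and W4..7 */
            msg1  = block[1];
            e1    = _mm_sha1nexte_epu32(e1, msg1);
            *e0   = *abcd;                            /* old ABCD feeds the following group */
            *abcd = _mm_sha1rnds4_epu32(*abcd, e1, 0);
            (void)_mm_sha1msg1_epu32(msg0, msg1);     /* first half of the W16..19 schedule */
    }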
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm
new file mode 100644
index 000000000..7b0ddb74e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm
@@ -0,0 +1,484 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+%endif
+
+;; FRAMESZ plus pushes must be an odd multiple of 8
+%define FRAMESZ 64 ; space for ABCDE
+%define RSPSAVE rax
+
+%define ABCD xmm0
+; two E's because they ping-pong
+%define E0 xmm1
+%define E1 xmm2
+%define MSG0 xmm3
+%define MSG1 xmm4
+%define MSG2 xmm5
+%define MSG3 xmm6
+
+%define ABCDb xmm7
+%define E0b xmm8 ; Need two E's b/c they ping pong
+%define E1b xmm9
+%define MSG0b xmm10
+%define MSG1b xmm11
+%define MSG2b xmm12
+%define MSG3b xmm13
+
+%define SHUF_MASK xmm14
+
+; arg index starts from 0, while mgr_flush/submit counts from 1
+%define MGR arg0
+
+%define NBLK arg1
+%define NLANX4 r10 ; consistent with caller
+%define IDX r8 ; local variable -- consistent with caller
+%define DPTR r11 ; local variable -- input buffer pointer
+%define DPTRb r12 ;
+%define TMP r9 ; local variable -- assistant to address digest
+%define TMPb r13 ; local variable -- assistant to address digest
+align 32
+
+; void sha1_ni_x2(SHA1_MB_ARGS_Xn *args, uint32_t size_in_blocks);
+; arg 0 : MGR : pointer to args (only 2 of the 16 lanes used)
+; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
+; invisible arg 2 : IDX : hash on which lane
+; invisible arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it)
+; (sse/avx is 4, avx2 is 8, avx512 is 16)
+;
+; Clobbers registers: rax, r9~r13, xmm0-xmm14
+;
+mk_global sha1_ni_x2, function, internal
+sha1_ni_x2:
+ endbranch
+ mov RSPSAVE, rsp
+ sub rsp, FRAMESZ
+ and rsp, ~0xF ; Align 16Bytes downward
+
+ shl NBLK, 6 ; transform blk amount into bytes
+ jz backto_mgr
+
+ ; detach idx from nlanx4
+ mov IDX, NLANX4
+ shr NLANX4, 8
+ and IDX, 0xff
+
+ lea TMP, [MGR + _args_digest ];
+ lea TMPb,[MGR + _args_digest + 4*1];
+
+ ;; Initialize digest
+ pinsrd ABCD, [TMP + 0*NLANX4], 3
+ pinsrd ABCD, [TMP + 1*NLANX4], 2
+ pinsrd ABCD, [TMP + 2*NLANX4], 1
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pinsrd ABCD, [TMP + 1*NLANX4], 0
+ pinsrd E0, [TMP + 2*NLANX4], 3
+ pand E0, [IDX3_WORD_MASK]
+
+ pinsrd ABCDb, [TMPb + 0*NLANX4], 3
+ pinsrd ABCDb, [TMPb + 1*NLANX4], 2
+ pinsrd ABCDb, [TMPb + 2*NLANX4], 1
+ lea TMPb, [TMPb + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pinsrd ABCDb, [TMPb + 1*NLANX4], 0
+ pinsrd E0b, [TMPb + 2*NLANX4], 3
+ pand E0b, [IDX3_WORD_MASK]
+
+ movdqa SHUF_MASK, [PSHUFFLE_SHANI_MASK]
+
+ ;; Load input pointers
+ mov DPTR, [MGR + _data_ptr ]
+ mov DPTRb,[MGR + _data_ptr + 8*1]
+ ;; nblk is used to indicate data end
+ add NBLK, DPTR
+
+lloop:
+ movdqa [rsp + 0*16], E0
+ movdqa [rsp + 1*16], ABCD
+
+ movdqa [rsp + 2*16], E0b
+ movdqa [rsp + 3*16], ABCDb
+
+ ; do rounds 0-3
+ movdqu MSG0, [DPTR + 0*16]
+ pshufb MSG0, SHUF_MASK
+ paddd E0, MSG0
+ movdqa E1, ABCD
+ sha1rnds4 ABCD, E0, 0
+
+ movdqu MSG0b, [DPTRb + 0*16]
+ pshufb MSG0b, SHUF_MASK
+ paddd E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1rnds4 ABCDb, E0b, 0
+
+ ; do rounds 4-7
+ movdqu MSG1, [DPTR + 1*16]
+ pshufb MSG1, SHUF_MASK
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1rnds4 ABCD, E1, 0
+ sha1msg1 MSG0, MSG1
+
+ movdqu MSG1b, [DPTRb + 1*16]
+ pshufb MSG1b, SHUF_MASK
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1rnds4 ABCDb, E1b, 0
+ sha1msg1 MSG0b, MSG1b
+
+ ; do rounds 8-11
+ movdqu MSG2, [DPTR + 2*16]
+ pshufb MSG2, SHUF_MASK
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1rnds4 ABCD, E0, 0
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ movdqu MSG2b, [DPTRb + 2*16]
+ pshufb MSG2b, SHUF_MASK
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1rnds4 ABCDb, E0b, 0
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ; do rounds 12-15
+ movdqu MSG3, [DPTR + 3*16]
+ pshufb MSG3, SHUF_MASK
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 0
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ movdqu MSG3b, [DPTRb + 3*16]
+ pshufb MSG3b, SHUF_MASK
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 0
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
+
+ ; do rounds 16-19
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 0
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 0
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ; do rounds 20-23
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 1
+ sha1msg1 MSG0b, MSG1b
+ pxor MSG3b, MSG1b
+
+ ; do rounds 24-27
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 1
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 1
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ; do rounds 28-31
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 1
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
+
+ ; do rounds 32-35
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 1
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 1
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ; do rounds 36-39
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 1
+ sha1msg1 MSG0b, MSG1b
+ pxor MSG3b, MSG1b
+
+ ; do rounds 40-43
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 2
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ; do rounds 44-47
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 2
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 2
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
+
+ ; do rounds 48-51
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 2
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ; do rounds 52-55
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 2
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 2
+ sha1msg1 MSG0b, MSG1b
+ pxor MSG3b, MSG1b
+
+ ; do rounds 56-59
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 2
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ; do rounds 60-63
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 3
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 3
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
+
+ ; do rounds 64-67
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 3
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 3
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ; do rounds 68-71
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 3
+ pxor MSG3, MSG1
+
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 3
+ pxor MSG3b, MSG1b
+
+ ; do rounds 72-75
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 3
+
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 3
+
+ ; do rounds 76-79
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1rnds4 ABCD, E1, 3
+
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1rnds4 ABCDb, E1b, 3
+
+ ; Add current hash values with previously saved
+ sha1nexte E0, [rsp + 0*16]
+ paddd ABCD, [rsp + 1*16]
+
+ sha1nexte E0b, [rsp + 2*16]
+ paddd ABCDb, [rsp + 3*16]
+
+ ; Increment data pointer and loop if more to process
+ add DPTR, 64
+ add DPTRb, 64
+ cmp DPTR, NBLK
+ jne lloop
+
+ ; write out digests
+ lea TMP, [MGR + _args_digest]
+ pextrd [TMP + 0*NLANX4], ABCD, 3
+ pextrd [TMP + 1*NLANX4], ABCD, 2
+ pextrd [TMP + 2*NLANX4], ABCD, 1
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pextrd [TMP + 1*NLANX4], ABCD, 0
+ pextrd [TMP + 2*NLANX4], E0, 3
+
+ lea TMPb, [MGR +_args_digest + 4*1]
+ pextrd [TMPb + 0*NLANX4], ABCDb, 3
+ pextrd [TMPb + 1*NLANX4], ABCDb, 2
+ pextrd [TMPb + 2*NLANX4], ABCDb, 1
+ lea TMPb, [TMPb + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pextrd [TMPb + 1*NLANX4], ABCDb, 0
+ pextrd [TMPb + 2*NLANX4], E0b, 3
+
+ ; update input pointers
+ mov [MGR + _data_ptr], DPTR
+ mov [MGR + _data_ptr + 8*1], DPTRb
+
+backto_mgr:
+;;;;;;;;;;;;;;;;
+;; Postamble
+
+ mov rsp, RSPSAVE
+
+ ret
+
+section .data align=16
+PSHUFFLE_SHANI_MASK: dq 0x08090a0b0c0d0e0f, 0x0001020304050607
+IDX3_WORD_MASK: dq 0x0000000000000000, 0xFFFFFFFF00000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_ni_x2
+no_sha1_ni_x2:
+%endif
+%endif ; HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm
new file mode 100644
index 000000000..aeb00a008
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm
@@ -0,0 +1,485 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+%endif
+
+;; FRAMESZ plus pushes must be an odd multiple of 8, so that rsp is 16-byte aligned
+;; after the sub below (the call already pushed an 8-byte return address)
+_GPR_SAVE_SIZE equ 8*9 ;rbx, rdx, rbp, (rdi, rsi), r12~r15
+_WK_SAVE_SIZE equ 16*4
+
+_WK_SAVE equ 0
+_GPR_SAVE equ _WK_SAVE + _WK_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE
+
+; arg indexing starts from 0, while mgr_flush/submit starts from 1
+%define MGR arg0
+%define NBLK arg1
+%define NLANX4 r10 ; consistent with caller
+; rax~rdx, rsi, rdi, rbp are used for RR
+%define N_MGR r8
+%define IDX r9 ; local variable -- consistent with caller
+%define K_BASE r11
+%define BUFFER_PTR r12
+%define BUFFER_END r13
+%define TMP     r14             ; local variable -- used to address the digest
+
+%xdefine W_TMP xmm0
+%xdefine W_TMP2 xmm9
+
+%xdefine W0 xmm1
+%xdefine W4 xmm2
+%xdefine W8 xmm3
+%xdefine W12 xmm4
+%xdefine W16 xmm5
+%xdefine W20 xmm6
+%xdefine W24 xmm7
+%xdefine W28 xmm8
+
+%xdefine XMM_SHUFB_BSWAP xmm10
+
+;; we keep a circular buffer of pre-calculated w[i]+K values (16 dwords, 64 bytes) on the stack
+%xdefine WK(t) (rsp + (t & 15)*4)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Constants
+
+%xdefine K1 0x5a827999
+%xdefine K2 0x6ed9eba1
+%xdefine K3 0x8f1bbcdc
+%xdefine K4 0xca62c1d6
+
+%xdefine W_PRECALC_AHEAD 16
+%xdefine W_NO_TAIL_PRECALC 0
+
+; Rounds macros
+
+%macro REGALLOC 0
+ %xdefine A ecx
+ %xdefine B esi
+ %xdefine C edi
+ %xdefine D ebp
+ %xdefine E edx
+
+ %xdefine T1 eax
+ %xdefine T2 ebx
+%endmacro
+
+%macro F1 3
+ mov T1,%2
+ xor T1,%3
+ and T1,%1
+ xor T1,%3
+%endmacro
+
+%macro F2 3
+ mov T1,%3
+ xor T1,%2
+ xor T1,%1
+%endmacro
+
+%macro F3 3
+ mov T1,%2
+ mov T2,%1
+ or T1,%1
+ and T2,%2
+ and T1,%3
+ or T1,T2
+%endmacro
+
+%define F4 F2
+
+%macro UPDATE_HASH 2
+ add %2, %1
+ mov %1, %2
+%endmacro
+
+
+%macro W_PRECALC 1
+ %xdefine i (%1)
+
+ %if (i < 20)
+ %xdefine K_XMM 0
+ %elif (i < 40)
+ %xdefine K_XMM 16
+ %elif (i < 60)
+ %xdefine K_XMM 32
+ %else
+ %xdefine K_XMM 48
+ %endif
+
+ %if (i<16 || (i>=80 && i<(80 + W_PRECALC_AHEAD)))
+
+ %if (W_NO_TAIL_PRECALC == 0)
+
+ %xdefine i ((%1) % 80) ;; pre-compute for the next iteration
+
+ %if (i == 0)
+ W_PRECALC_RESET
+ %endif
+
+
+ W_PRECALC_00_15
+ %endif
+
+ %elif (i < 32)
+ W_PRECALC_16_31
+ %elif (i < 80) ;; rounds 32-79
+ W_PRECALC_32_79
+ %endif
+%endmacro
+
+%macro W_PRECALC_RESET 0
+ %xdefine W W0
+ %xdefine W_minus_04 W4
+ %xdefine W_minus_08 W8
+ %xdefine W_minus_12 W12
+ %xdefine W_minus_16 W16
+ %xdefine W_minus_20 W20
+ %xdefine W_minus_24 W24
+ %xdefine W_minus_28 W28
+ %xdefine W_minus_32 W
+%endmacro
+
+%macro W_PRECALC_ROTATE 0
+ %xdefine W_minus_32 W_minus_28
+ %xdefine W_minus_28 W_minus_24
+ %xdefine W_minus_24 W_minus_20
+ %xdefine W_minus_20 W_minus_16
+ %xdefine W_minus_16 W_minus_12
+ %xdefine W_minus_12 W_minus_08
+ %xdefine W_minus_08 W_minus_04
+ %xdefine W_minus_04 W
+ %xdefine W W_minus_32
+%endmacro
+
+%macro W_PRECALC_00_15 0
+ ;; message scheduling pre-compute for rounds 0-15
+ %if ((i & 3) == 0) ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
+ movdqu W_TMP, [BUFFER_PTR + (i * 4)]
+ %elif ((i & 3) == 1)
+ pshufb W_TMP, XMM_SHUFB_BSWAP
+ movdqa W, W_TMP
+ %elif ((i & 3) == 2)
+ paddd W_TMP, [K_BASE]
+ %elif ((i & 3) == 3)
+ movdqa [WK(i&~3)], W_TMP
+
+ W_PRECALC_ROTATE
+ %endif
+%endmacro
+
+%macro W_PRECALC_16_31 0
+ ;; message scheduling pre-compute for rounds 16-31
+ ;; calculating last 32 w[i] values in 8 XMM registers
+ ;; pre-calculate K+w[i] values and store to mem, for later load by ALU add instruction
+ ;;
+ ;; "brute force" vectorization for rounds 16-31 only due to w[i]->w[i-3] dependency
+ ;;
+ %if ((i & 3) == 0) ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
+ movdqa W, W_minus_12
+ palignr W, W_minus_16, 8 ;; w[i-14]
+ movdqa W_TMP, W_minus_04
+ psrldq W_TMP, 4 ;; w[i-3]
+ pxor W, W_minus_08
+ %elif ((i & 3) == 1)
+ pxor W_TMP, W_minus_16
+ pxor W, W_TMP
+ movdqa W_TMP2, W
+ movdqa W_TMP, W
+ pslldq W_TMP2, 12
+ %elif ((i & 3) == 2)
+ psrld W, 31
+ pslld W_TMP, 1
+ por W_TMP, W
+ movdqa W, W_TMP2
+ psrld W_TMP2, 30
+ pslld W, 2
+ %elif ((i & 3) == 3)
+ pxor W_TMP, W
+ pxor W_TMP, W_TMP2
+ movdqa W, W_TMP
+ paddd W_TMP, [K_BASE + K_XMM]
+ movdqa [WK(i&~3)],W_TMP
+
+ W_PRECALC_ROTATE
+ %endif
+%endmacro
+
+%macro W_PRECALC_32_79 0
+ ;; in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
+	;; instead we compute the equivalent: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+	;; which allows more efficient vectorization, since the w[i] => w[i-3] dependency is broken
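+	;; (to see the equivalence, expand each of the four terms of the original recurrence
+	;;  once more with the same recurrence: w[i-11], w[i-17], w[i-19], w[i-22], w[i-24] and
+	;;  w[i-30] each occur twice and cancel, leaving w[i-6]^w[i-16]^w[i-28]^w[i-32] under a
+	;;  double rol-by-1, i.e. rol 2; valid for i >= 32, hence the separate 16-31 path above)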
+ ;;
+ %if ((i & 3) == 0) ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
+ movdqa W_TMP, W_minus_04
+ pxor W, W_minus_28 ;; W is W_minus_32 before xor
+ palignr W_TMP, W_minus_08, 8
+ %elif ((i & 3) == 1)
+ pxor W, W_minus_16
+ pxor W, W_TMP
+ movdqa W_TMP, W
+ %elif ((i & 3) == 2)
+ psrld W, 30
+ pslld W_TMP, 2
+ por W_TMP, W
+ %elif ((i & 3) == 3)
+ movdqa W, W_TMP
+ paddd W_TMP, [K_BASE + K_XMM]
+ movdqa [WK(i&~3)],W_TMP
+
+ W_PRECALC_ROTATE
+ %endif
+%endmacro
+
+%macro RR 6 ;; RR does two rounds of SHA-1 back to back with W pre-calculation
+
+ ;; TEMP = A
+ ;; A = F( i, B, C, D ) + E + ROTATE_LEFT( A, 5 ) + W[i] + K(i)
+ ;; C = ROTATE_LEFT( B, 30 )
+ ;; D = C
+ ;; E = D
+ ;; B = TEMP
+
+ W_PRECALC (%6 + W_PRECALC_AHEAD)
+ F %2, %3, %4 ;; F returns result in T1
+ add %5, [WK(%6)]
+ rol %2, 30
+ mov T2, %1
+ add %4, [WK(%6 + 1)]
+ rol T2, 5
+ add %5, T1
+
+ W_PRECALC (%6 + W_PRECALC_AHEAD + 1)
+ add T2, %5
+ mov %5, T2
+ rol T2, 5
+ add %4, T2
+ F %1, %2, %3 ;; F returns result in T1
+ add %4, T1
+ rol %1, 30
+
+;; write: %1, %2
+;; rotate: %1<=%4, %2<=%5, %3<=%1, %4<=%2, %5<=%3
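+;; hence consecutive invocations rotate the argument order: e.g. RR A,B,C,D,E,0 is
+;; followed by RR D,E,A,B,C,2 (see the round sequence in the function body below)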
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; void sha1_opt_x1(SHA1_MB_ARGS_Xn *args, uint32_t size_in_blocks);
+; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used)
+; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
+; invisible arg 2 : IDX : which lane to hash on
+; invisible arg 3 : NLANX4 : max lanes*4 for this arch (the digest is located through it)
+; (sse/avx is 4, avx2 is 8, avx512 is 16)
+;
+; Clobbers registers: all general regs (except r15), xmm0-xmm10
+; {rbx, rdx, rbp, (rdi, rsi), r12~r15 are saved on stack}
+;
+mk_global sha1_opt_x1, function, internal
+sha1_opt_x1:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], rbp
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ mov [rsp + _GPR_SAVE + 8*3], rsi
+ ; caller has already stored XMM6~10
+%endif
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+ mov [rsp + _GPR_SAVE + 8*8], rdx
+
+
+ shl NBLK, 6 ; transform blk amount into bytes
+ jz .lend
+ ; detach idx from nlanx4
+ mov IDX, NLANX4
+ shr NLANX4, 8
+ and IDX, 0xff
+
+	;; let sha1_opt take over r8~r11
+ ;; Load input pointers
+ mov N_MGR, MGR
+ mov BUFFER_PTR, [MGR + _data_ptr + IDX*8]
+ ;; nblk is used to indicate data end
+ add NBLK, BUFFER_PTR
+ mov BUFFER_END, NBLK
+
+ lea K_BASE, [K_XMM_AR]
+ movdqu XMM_SHUFB_BSWAP, [bswap_shufb_ctl]
+
+ REGALLOC
+
+ lea TMP, [N_MGR + 4*IDX]
+ ;; Initialize digest
+ mov A, [TMP + 0*NLANX4]
+ mov B, [TMP + 1*NLANX4]
+ mov C, [TMP + 2*NLANX4]
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ mov D, [TMP + 1*NLANX4]
+ mov E, [TMP + 2*NLANX4]
+
+ %assign i 0
+ %rep W_PRECALC_AHEAD
+ W_PRECALC i
+ %assign i i+1
+ %endrep
+
+ %xdefine F F1
+
+.lloop:
+ cmp BUFFER_PTR, K_BASE ;; we use K_BASE value as a signal of a last block,
+ jne .lbegin ;; it is set below by: cmovae BUFFER_PTR, K_BASE
+ jmp .lend
+
+.lbegin:
+ RR A,B,C,D,E,0
+ RR D,E,A,B,C,2
+ RR B,C,D,E,A,4
+ RR E,A,B,C,D,6
+ RR C,D,E,A,B,8
+
+ RR A,B,C,D,E,10
+ RR D,E,A,B,C,12
+ RR B,C,D,E,A,14
+ RR E,A,B,C,D,16
+ RR C,D,E,A,B,18
+
+ %xdefine F F2
+
+ RR A,B,C,D,E,20
+ RR D,E,A,B,C,22
+ RR B,C,D,E,A,24
+ RR E,A,B,C,D,26
+ RR C,D,E,A,B,28
+
+ RR A,B,C,D,E,30
+ RR D,E,A,B,C,32
+ RR B,C,D,E,A,34
+ RR E,A,B,C,D,36
+ RR C,D,E,A,B,38
+
+ %xdefine F F3
+
+ RR A,B,C,D,E,40
+ RR D,E,A,B,C,42
+ RR B,C,D,E,A,44
+ RR E,A,B,C,D,46
+ RR C,D,E,A,B,48
+
+ RR A,B,C,D,E,50
+ RR D,E,A,B,C,52
+ RR B,C,D,E,A,54
+ RR E,A,B,C,D,56
+ RR C,D,E,A,B,58
+
+ %xdefine F F4
+
+ add BUFFER_PTR, 64 ;; move to next 64-byte block
+ cmp BUFFER_PTR, BUFFER_END ;; check if current block is the last one
+ cmovae BUFFER_PTR, K_BASE ;; smart way to signal the last iteration
+
+ RR A,B,C,D,E,60
+ RR D,E,A,B,C,62
+ RR B,C,D,E,A,64
+ RR E,A,B,C,D,66
+ RR C,D,E,A,B,68
+
+ RR A,B,C,D,E,70
+ RR D,E,A,B,C,72
+ RR B,C,D,E,A,74
+ RR E,A,B,C,D,76
+ RR C,D,E,A,B,78
+
+ lea TMP, [N_MGR + 4*IDX]
+ UPDATE_HASH [TMP + 0*NLANX4],A
+ UPDATE_HASH [TMP + 1*NLANX4],B
+ UPDATE_HASH [TMP + 2*NLANX4],C
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ UPDATE_HASH [TMP + 1*NLANX4],D
+ UPDATE_HASH [TMP + 2*NLANX4],E
+
+ jmp .lloop
+
+ .lend:
+ mov MGR, N_MGR
+
+ mov rdx, [rsp + _GPR_SAVE + 8*8]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rsi, [rsp + _GPR_SAVE + 8*3]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbp, [rsp + _GPR_SAVE + 8*1]
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ add rsp, STACK_SPACE
+
+ ret
+
+
+;;----------------------
+section .data align=64
+
+align 128
+K_XMM_AR:
+ DD K1, K1, K1, K1
+ DD K2, K2, K2, K2
+ DD K3, K3, K3, K3
+ DD K4, K4, K4, K4
+
+align 16
+bswap_shufb_ctl:
+ DD 00010203h
+ DD 04050607h
+ DD 08090a0bh
+ DD 0c0d0e0fh
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c
new file mode 100644
index 000000000..e82fb30fe
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c
@@ -0,0 +1,220 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sha1_mb.h"
+#include "endian_helper.h"
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference SHA1 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+#define H0 0x67452301
+#define H1 0xefcdab89
+#define H2 0x98badcfe
+#define H3 0x10325476
+#define H4 0xc3d2e1f0
+
+#define F1(b,c,d) (d ^ (b & (c ^ d)))
+#define F2(b,c,d) (b ^ c ^ d)
+#define F3(b,c,d) ((b & c) | (d & (b | c)))
+#define F4(b,c,d) (b ^ c ^ d)
+
+#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r))))
+
+#define W(x) w[(x) & 15]
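+/* 16-entry circular window over the message schedule: only the most recent 16 w values
+ * are kept, which is all the recurrence w[i-3]^w[i-8]^w[i-14]^w[i-16] ever needs. */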
+
+#define step00_19(i,a,b,c,d,e) \
+ if (i>15) W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ else W(i) = to_be32(ww[i]); \
+ e += rol32(a,5) + F1(b,c,d) + 0x5A827999 + W(i); \
+ b = rol32(b,30)
+
+#define step20_39(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F2(b,c,d) + 0x6ED9EBA1 + W(i); \
+ b = rol32(b,30)
+
+#define step40_59(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F3(b,c,d) + 0x8F1BBCDC + W(i); \
+ b = rol32(b,30)
+
+#define step60_79(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F4(b,c,d) + 0xCA62C1D6 + W(i); \
+ b = rol32(b,30)
+
+static void OPT_FIX sha1_single(const uint8_t * data, uint32_t digest[]);
+
+void sha1_ref(const uint8_t * input_data, uint32_t * digest, const uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA1_BLOCK_SIZE];
+
+ digest[0] = H0;
+ digest[1] = H1;
+ digest[2] = H2;
+ digest[3] = H3;
+ digest[4] = H4;
+
+ i = len;
+ while (i >= SHA1_BLOCK_SIZE) {
+ sha1_single(input_data, digest);
+ input_data += SHA1_BLOCK_SIZE;
+ i -= SHA1_BLOCK_SIZE;
+ }
+
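+	/* Pad the final partial block: append the 0x80 marker byte, zero-fill, and store the
+	 * total message length in bits as a 64-bit big-endian value at the end of the last
+	 * block; this needs one or two extra blocks depending on the space left. */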
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA1_BLOCK_SIZE) - SHA1_PADLENGTHFIELD_SIZE); j++)
+ buf[j] = 0;
+
+ if (i > SHA1_BLOCK_SIZE - SHA1_PADLENGTHFIELD_SIZE)
+ i = 2 * SHA1_BLOCK_SIZE;
+ else
+ i = SHA1_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8);
+
+ sha1_single(buf, digest);
+ if (i == (2 * SHA1_BLOCK_SIZE))
+ sha1_single(buf + SHA1_BLOCK_SIZE, digest);
+}
+
+void sha1_single(const uint8_t * data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e;
+ uint32_t w[16] = { 0 };
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+
+ step00_19(0, a, b, c, d, e);
+ step00_19(1, e, a, b, c, d);
+ step00_19(2, d, e, a, b, c);
+ step00_19(3, c, d, e, a, b);
+ step00_19(4, b, c, d, e, a);
+ step00_19(5, a, b, c, d, e);
+ step00_19(6, e, a, b, c, d);
+ step00_19(7, d, e, a, b, c);
+ step00_19(8, c, d, e, a, b);
+ step00_19(9, b, c, d, e, a);
+ step00_19(10, a, b, c, d, e);
+ step00_19(11, e, a, b, c, d);
+ step00_19(12, d, e, a, b, c);
+ step00_19(13, c, d, e, a, b);
+ step00_19(14, b, c, d, e, a);
+ step00_19(15, a, b, c, d, e);
+ step00_19(16, e, a, b, c, d);
+ step00_19(17, d, e, a, b, c);
+ step00_19(18, c, d, e, a, b);
+ step00_19(19, b, c, d, e, a);
+
+ step20_39(20, a, b, c, d, e);
+ step20_39(21, e, a, b, c, d);
+ step20_39(22, d, e, a, b, c);
+ step20_39(23, c, d, e, a, b);
+ step20_39(24, b, c, d, e, a);
+ step20_39(25, a, b, c, d, e);
+ step20_39(26, e, a, b, c, d);
+ step20_39(27, d, e, a, b, c);
+ step20_39(28, c, d, e, a, b);
+ step20_39(29, b, c, d, e, a);
+ step20_39(30, a, b, c, d, e);
+ step20_39(31, e, a, b, c, d);
+ step20_39(32, d, e, a, b, c);
+ step20_39(33, c, d, e, a, b);
+ step20_39(34, b, c, d, e, a);
+ step20_39(35, a, b, c, d, e);
+ step20_39(36, e, a, b, c, d);
+ step20_39(37, d, e, a, b, c);
+ step20_39(38, c, d, e, a, b);
+ step20_39(39, b, c, d, e, a);
+
+ step40_59(40, a, b, c, d, e);
+ step40_59(41, e, a, b, c, d);
+ step40_59(42, d, e, a, b, c);
+ step40_59(43, c, d, e, a, b);
+ step40_59(44, b, c, d, e, a);
+ step40_59(45, a, b, c, d, e);
+ step40_59(46, e, a, b, c, d);
+ step40_59(47, d, e, a, b, c);
+ step40_59(48, c, d, e, a, b);
+ step40_59(49, b, c, d, e, a);
+ step40_59(50, a, b, c, d, e);
+ step40_59(51, e, a, b, c, d);
+ step40_59(52, d, e, a, b, c);
+ step40_59(53, c, d, e, a, b);
+ step40_59(54, b, c, d, e, a);
+ step40_59(55, a, b, c, d, e);
+ step40_59(56, e, a, b, c, d);
+ step40_59(57, d, e, a, b, c);
+ step40_59(58, c, d, e, a, b);
+ step40_59(59, b, c, d, e, a);
+
+ step60_79(60, a, b, c, d, e);
+ step60_79(61, e, a, b, c, d);
+ step60_79(62, d, e, a, b, c);
+ step60_79(63, c, d, e, a, b);
+ step60_79(64, b, c, d, e, a);
+ step60_79(65, a, b, c, d, e);
+ step60_79(66, e, a, b, c, d);
+ step60_79(67, d, e, a, b, c);
+ step60_79(68, c, d, e, a, b);
+ step60_79(69, b, c, d, e, a);
+ step60_79(70, a, b, c, d, e);
+ step60_79(71, e, a, b, c, d);
+ step60_79(72, d, e, a, b, c);
+ step60_79(73, c, d, e, a, b);
+ step60_79(74, b, c, d, e, a);
+ step60_79(75, a, b, c, d, e);
+ step60_79(76, e, a, b, c, d);
+ step60_79(77, d, e, a, b, c);
+ step60_79(78, c, d, e, a, b);
+ step60_79(79, b, c, d, e, a);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am
new file mode 100644
index 000000000..9405c2469
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am
@@ -0,0 +1,127 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_x86_64 += sha256_mb/sha256_ctx_sse.c \
+ sha256_mb/sha256_ctx_avx.c \
+ sha256_mb/sha256_ctx_avx2.c \
+ sha256_mb/sha256_ctx_base.c
+
+lsrc_x86_64 += sha256_mb/sha256_mb_mgr_init_sse.c \
+ sha256_mb/sha256_mb_mgr_init_avx2.c
+
+
+lsrc_x86_64 += sha256_mb/sha256_mb_mgr_submit_sse.asm \
+ sha256_mb/sha256_mb_mgr_submit_avx.asm \
+ sha256_mb/sha256_mb_mgr_submit_avx2.asm \
+ sha256_mb/sha256_mb_mgr_flush_sse.asm \
+ sha256_mb/sha256_mb_mgr_flush_avx.asm \
+ sha256_mb/sha256_mb_mgr_flush_avx2.asm \
+ sha256_mb/sha256_mb_x4_sse.asm \
+ sha256_mb/sha256_mb_x4_avx.asm \
+ sha256_mb/sha256_mb_x8_avx2.asm \
+ sha256_mb/sha256_multibinary.asm
+
+lsrc_x86_64 += sha256_mb/sha256_ctx_avx512.c \
+ sha256_mb/sha256_mb_mgr_init_avx512.c \
+ sha256_mb/sha256_mb_mgr_submit_avx512.asm \
+ sha256_mb/sha256_mb_mgr_flush_avx512.asm \
+ sha256_mb/sha256_mb_x16_avx512.asm
+
+lsrc_x86_64 += sha256_mb/sha256_opt_x1.asm
+
+lsrc_x86_64 += sha256_mb/sha256_ni_x1.asm \
+ sha256_mb/sha256_ni_x2.asm \
+ sha256_mb/sha256_ctx_sse_ni.c \
+ sha256_mb/sha256_ctx_avx512_ni.c \
+ sha256_mb/sha256_mb_mgr_submit_sse_ni.asm \
+ sha256_mb/sha256_mb_mgr_flush_sse_ni.asm \
+ sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm
+
+lsrc_x86_32 += $(lsrc_x86_64)
+
+lsrc_aarch64 += sha256_mb/sha256_ctx_base.c \
+ sha256_mb/sha256_ref.c
+
+lsrc_aarch64 += sha256_mb/aarch64/sha256_mb_multibinary.S \
+ sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c \
+ sha256_mb/aarch64/sha256_ctx_ce.c \
+ sha256_mb/aarch64/sha256_mb_mgr_ce.c \
+ sha256_mb/aarch64/sha256_mb_x1_ce.S \
+ sha256_mb/aarch64/sha256_mb_x2_ce.S \
+ sha256_mb/aarch64/sha256_mb_x3_ce.S \
+ sha256_mb/aarch64/sha256_mb_x4_ce.S
+
+
+lsrc_base_aliases += sha256_mb/sha256_ctx_base_aliases.c \
+ sha256_mb/sha256_ctx_base.c \
+ sha256_mb/sha256_ref.c
+
+src_include += -I $(srcdir)/sha256_mb
+
+extern_hdrs += include/sha256_mb.h \
+ include/multi_buffer.h
+
+other_src += include/datastruct.asm \
+ include/multibinary.asm \
+ sha256_mb/sha256_job.asm \
+ sha256_mb/sha256_mb_mgr_datastruct.asm \
+ include/reg_sizes.asm \
+ sha256_mb/sha256_ref.c \
+ include/memcpy_inline.h \
+ include/memcpy.asm \
+ include/intrinreg.h
+
+check_tests += sha256_mb/sha256_mb_test \
+ sha256_mb/sha256_mb_rand_test \
+ sha256_mb/sha256_mb_rand_update_test \
+ sha256_mb/sha256_mb_flush_test
+
+unit_tests += sha256_mb/sha256_mb_rand_ssl_test
+
+perf_tests += sha256_mb/sha256_mb_vs_ossl_perf \
+ sha256_mb/sha256_mb_vs_ossl_shortage_perf
+
+sha256_mb_rand_ssl_test: sha256_ref.o
+sha256_mb_rand_test: sha256_ref.o
+sha256_mb_sha256_mb_rand_test_LDADD = sha256_mb/sha256_ref.lo libisal_crypto.la
+
+sha256_mb_rand_update_test: sha256_ref.o
+sha256_mb_sha256_mb_rand_update_test_LDADD = sha256_mb/sha256_ref.lo libisal_crypto.la
+
+sha256_mb_flush_test: sha256_ref.o
+sha256_mb_sha256_mb_flush_test_LDADD = sha256_mb/sha256_ref.lo libisal_crypto.la
+
+sha256_mb_rand_ssl_test: LDLIBS += -lcrypto
+sha256_mb_sha256_mb_rand_ssl_test_LDFLAGS = -lcrypto
+
+sha256_mb_vs_ossl_perf: LDLIBS += -lcrypto
+sha256_mb_sha256_mb_vs_ossl_perf_LDFLAGS = -lcrypto
+
+sha256_mb_vs_ossl_shortage_perf: LDLIBS += -lcrypto
+sha256_mb_sha256_mb_vs_ossl_shortage_perf_LDFLAGS = -lcrypto
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_ctx_ce.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_ctx_ce.c
new file mode 100644
index 000000000..4776f55bd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_ctx_ce.c
@@ -0,0 +1,256 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+void sha256_mb_mgr_init_ce(SHA256_MB_JOB_MGR * state);
+SHA256_JOB *sha256_mb_mgr_submit_ce(SHA256_MB_JOB_MGR * state, SHA256_JOB * job);
+SHA256_JOB *sha256_mb_mgr_flush_ce(SHA256_MB_JOB_MGR * state);
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_ce(SHA256_HASH_CTX_MGR * mgr)
+{
+ sha256_mb_mgr_init_ce(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_ce(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_fixedlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx =
+ (SHA256_HASH_CTX *) sha256_mb_mgr_submit_ce(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_ce(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_ce(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_ce(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx =
+ (SHA256_HASH_CTX *) sha256_mb_mgr_submit_ce(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
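+	/* i is now the total padded length in bytes (one or two whole blocks), i.e. the end
+	 * of the last extra block, so the bit-length field is written at i - 8 below. */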
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_ce_slver_02020142;
+struct slver sha256_ctx_mgr_init_ce_slver = { 0x0142, 0x02, 0x02 };
+
+struct slver sha256_ctx_mgr_submit_ce_slver_02020143;
+struct slver sha256_ctx_mgr_submit_ce_slver = { 0x0143, 0x02, 0x02 };
+
+struct slver sha256_ctx_mgr_flush_ce_slver_02020144;
+struct slver sha256_ctx_mgr_flush_ce_slver = { 0x0144, 0x02, 0x02 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c
new file mode 100644
index 000000000..8627991c3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c
@@ -0,0 +1,59 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
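+/* Runtime dispatch: HWCAP_SHA2 in the hardware capabilities reported by getauxval()
+ * indicates the ARMv8 SHA-2 Crypto Extensions, so the _ce implementations are picked;
+ * otherwise the generic base implementations are used. */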
+DEFINE_INTERFACE_DISPATCHER(sha256_ctx_mgr_submit)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA2)
+ return PROVIDER_INFO(sha256_ctx_mgr_submit_ce);
+
+ return PROVIDER_BASIC(sha256_ctx_mgr_submit);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(sha256_ctx_mgr_init)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA2)
+ return PROVIDER_INFO(sha256_ctx_mgr_init_ce);
+
+ return PROVIDER_BASIC(sha256_ctx_mgr_init);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(sha256_ctx_mgr_flush)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA2)
+ return PROVIDER_INFO(sha256_ctx_mgr_flush_ce);
+
+ return PROVIDER_BASIC(sha256_ctx_mgr_flush);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_mgr_ce.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_mgr_ce.c
new file mode 100644
index 000000000..aa63c4dd8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_mgr_ce.c
@@ -0,0 +1,254 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stddef.h>
+#include <sha256_mb.h>
+#include <assert.h>
+
+#ifndef max
+#define max(a,b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#define SHA256_MB_CE_MAX_LANES 3
+
+#if SHA256_MB_CE_MAX_LANES >=4
+void sha256_mb_ce_x4(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int);
+#endif
+#if SHA256_MB_CE_MAX_LANES >=3
+void sha256_mb_ce_x3(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int);
+#endif
+#if SHA256_MB_CE_MAX_LANES >=2
+void sha256_mb_ce_x2(SHA256_JOB *, SHA256_JOB *, int);
+#endif
+void sha256_mb_ce_x1(SHA256_JOB *, int);
+
+#define LANE_IS_NOT_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FREE(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL)
+#define LANE_IS_INVALID(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL)
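+/* lens[i] packs (remaining length in blocks) << 4 | lane index, so taking the plain
+ * integer minimum over lens[] yields both the shortest job (upper bits) and its lane
+ * number (low nibble); see sha256_mb_mgr_do_jobs() and sha256_mb_mgr_insert_job(). */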
+void sha256_mb_mgr_init_ce(SHA256_MB_JOB_MGR * state)
+{
+ int i;
+
+ state->unused_lanes = 0xf;
+ state->num_lanes_inuse = 0;
+ for (i = SHA256_MB_CE_MAX_LANES - 1; i >= 0; i--) {
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->lens[i] = i;
+ state->ldata[i].job_in_lane = 0;
+ }
+
+	//lanes >= SHA256_MB_CE_MAX_LANES are invalid lanes
+ for (i = SHA256_MB_CE_MAX_LANES; i < SHA256_MAX_LANES; i++) {
+ state->lens[i] = 0xf;
+ state->ldata[i].job_in_lane = 0;
+ }
+}
+
+static int sha256_mb_mgr_do_jobs(SHA256_MB_JOB_MGR * state)
+{
+ int lane_idx, len, i, lanes;
+
+ int lane_idx_array[SHA256_MAX_LANES];
+
+ if (state->num_lanes_inuse == 0) {
+ return -1;
+ }
+#if SHA256_MB_CE_MAX_LANES == 4
+ if (state->num_lanes_inuse == 4) {
+ len = min(min(state->lens[0], state->lens[1]),
+ min(state->lens[2], state->lens[3]));
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+
+ sha256_mb_ce_x4(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane,
+ state->ldata[2].job_in_lane,
+ state->ldata[3].job_in_lane, len >> 4);
+
+ } else
+#elif SHA256_MB_CE_MAX_LANES == 3
+ if (state->num_lanes_inuse == 3) {
+ len = min(min(state->lens[0], state->lens[1]), state->lens[2]);
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+
+ sha256_mb_ce_x3(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane,
+ state->ldata[2].job_in_lane, len >> 4);
+
+ } else
+#elif SHA256_MB_CE_MAX_LANES == 2
+ if (state->num_lanes_inuse == 2) {
+ len = min(state->lens[0], state->lens[1]);
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+
+ sha256_mb_ce_x2(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane, len >> 4);
+
+ } else
+#endif
+ {
+ lanes = 0, len = 0;
+ for (i = 0; i < SHA256_MAX_LANES && lanes < state->num_lanes_inuse; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ if (lanes)
+ len = min(len, state->lens[i]);
+ else
+ len = state->lens[i];
+ lane_idx_array[lanes] = i;
+ lanes++;
+ }
+ }
+ if (lanes == 0)
+ return -1;
+ lane_idx = len & 0xf;
+ len = len & (~0xf);
+#if SHA256_MB_CE_MAX_LANES >=4
+ if (lanes == 4) {
+ sha256_mb_ce_x4(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane,
+ state->ldata[lane_idx_array[2]].job_in_lane,
+ state->ldata[lane_idx_array[3]].job_in_lane, len >> 4);
+
+ } else
+#endif
+#if SHA256_MB_CE_MAX_LANES >=3
+ if (lanes == 3) {
+ sha256_mb_ce_x3(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane,
+ state->ldata[lane_idx_array[2]].job_in_lane, len >> 4);
+ } else
+#endif
+#if SHA256_MB_CE_MAX_LANES >=2
+ if (lanes == 2) {
+ sha256_mb_ce_x2(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane, len >> 4);
+ } else
+#endif
+ {
+ sha256_mb_ce_x1(state->ldata[lane_idx_array[0]].job_in_lane, len >> 4);
+ }
+ }
+	//advance every unfinished lane by the amount just hashed (the minimum length);
+	//the lane that held that minimum is now finished and is the one returned
+ for (i = 0; i < SHA256_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 2;
+ }
+ }
+
+ return lane_idx;
+
+}
+
+static SHA256_JOB *sha256_mb_mgr_free_lane(SHA256_MB_JOB_MGR * state)
+{
+ int i;
+ SHA256_JOB *ret = NULL;
+
+ for (i = 0; i < SHA256_MB_CE_MAX_LANES; i++) {
+ if (LANE_IS_FINISHED(state, i)) {
+
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->num_lanes_inuse--;
+ ret = state->ldata[i].job_in_lane;
+ ret->status = STS_COMPLETED;
+ state->ldata[i].job_in_lane = NULL;
+ break;
+ }
+ }
+ return ret;
+}
+
+static void sha256_mb_mgr_insert_job(SHA256_MB_JOB_MGR * state, SHA256_JOB * job)
+{
+ int lane_idx;
+ //add job into lanes
+ lane_idx = state->unused_lanes & 0xf;
+ //fatal error
+ assert(lane_idx < SHA256_MB_CE_MAX_LANES);
+ state->lens[lane_idx] = (job->len << 4) | lane_idx;
+ state->ldata[lane_idx].job_in_lane = job;
+ state->unused_lanes >>= 4;
+ state->num_lanes_inuse++;
+}
+
+SHA256_JOB *sha256_mb_mgr_submit_ce(SHA256_MB_JOB_MGR * state, SHA256_JOB * job)
+{
+#ifndef NDEBUG
+ int lane_idx;
+#endif
+ SHA256_JOB *ret;
+
+ //add job into lanes
+ sha256_mb_mgr_insert_job(state, job);
+
+ ret = sha256_mb_mgr_free_lane(state);
+ if (ret != NULL) {
+ return ret;
+ }
+	//submit waits until every lane has data before starting to process
+ if (state->num_lanes_inuse < SHA256_MB_CE_MAX_LANES)
+ return NULL;
+#ifndef NDEBUG
+ lane_idx = sha256_mb_mgr_do_jobs(state);
+ assert(lane_idx != -1);
+#else
+ sha256_mb_mgr_do_jobs(state);
+#endif
+
+ //~ i = lane_idx;
+ ret = sha256_mb_mgr_free_lane(state);
+ return ret;
+}
+
+SHA256_JOB *sha256_mb_mgr_flush_ce(SHA256_MB_JOB_MGR * state)
+{
+ SHA256_JOB *ret;
+ ret = sha256_mb_mgr_free_lane(state);
+ if (ret) {
+ return ret;
+ }
+
+ sha256_mb_mgr_do_jobs(state);
+ return sha256_mb_mgr_free_lane(state);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_multibinary.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_multibinary.S
new file mode 100644
index 000000000..ecc5fc5f5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_multibinary.S
@@ -0,0 +1,36 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#include <aarch64_multibinary.h>
+
+
+mbin_interface sha256_ctx_mgr_submit
+mbin_interface sha256_ctx_mgr_init
+mbin_interface sha256_ctx_mgr_flush
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x1_ce.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x1_ce.S
new file mode 100644
index 000000000..06d0ab5fa
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x1_ce.S
@@ -0,0 +1,238 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+/**
+macros for rounds 48-63
+*/
+.macro sha256_4_rounds_high msg:req,tmp0:req,tmp1:req
+ ldr key_q , [tmp]
+ mov l0_tmp2_v.16b,l0_abcd_v.16b
+ add tmp,tmp,16
+ add l0_\tmp1\()_v.4s,l0_\msg\()_v.4s,key_v.4s
+ sha256h l0_abcd_q,l0_efgh_q,l0_\tmp0\()_v.4s
+ sha256h2 l0_efgh_q,l0_tmp2_q,l0_\tmp0\()_v.4s
+
+.endm
+/**
+macros for rounds 0-47
+*/
+.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req,tmp1:req
+ sha256su0 l0_\msg0\()_v.4s,l0_\msg1\()_v.4s
+ sha256_4_rounds_high \msg1,\tmp0,\tmp1
+ sha256su1 l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
+.endm
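+/* Each round group consumes the w+key sum prepared in its tmp0 argument by the previous
+ * group and prepares the next group's sum in tmp1, which is why the calls below alternate
+ * the tmp0,tmp1 / tmp1,tmp0 argument order. */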
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg key,31
+
+
+/*
+digest variables
+*/
+ declare_var_vector_reg l0_abcd,0
+ declare_var_vector_reg l0_efgh,1
+ declare_var_vector_reg l0_abcd_saved,5
+ declare_var_vector_reg l0_efgh_saved,6
+/*
+Temporary variables
+*/
+ declare_var_vector_reg l0_tmp0,2
+ declare_var_vector_reg l0_tmp1,3
+ declare_var_vector_reg l0_tmp2,4
+/*
+Message variables
+*/
+ declare_var_vector_reg l0_msg0,16
+ declare_var_vector_reg l0_msg1,17
+ declare_var_vector_reg l0_msg2,18
+ declare_var_vector_reg l0_msg3,19
+
+
+
+/*
+ void sha256_mb_ce_x1(SHA256_JOB * l0_job, int len);
+*/
+/*
+Arguments list
+*/
+ l0_job .req x0
+ len .req w1
+ l0_data .req x2
+ tmp .req x3
+ .global sha256_mb_ce_x1
+ .type sha256_mb_ce_x1, %function
+sha256_mb_ce_x1:
+ ldr l0_data, [l0_job]
+ ldr l0_abcd_q, [l0_job, 64]
+ ldr l0_efgh_q, [l0_job, 80]
+
+
+
+start_loop:
+ adr tmp, KEY
+ //load msgs
+ ld1 {l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
+ ldr key_q,[tmp]
+ add tmp,tmp,16
+ //adjust loop parameter
+ add l0_data,l0_data,64
+ sub len, len, #1
+ cmp len, 0
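+	// the flags set by this cmp survive the round body below (SIMD and non-flag-setting
+	// scalar instructions only) and are consumed by the bgt at the bottom of the loop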
+ //backup digest
+ mov l0_abcd_saved_v.16b,l0_abcd_v.16b
+ mov l0_efgh_saved_v.16b,l0_efgh_v.16b
+
+ rev32 l0_msg0_v.16b,l0_msg0_v.16b
+ rev32 l0_msg1_v.16b,l0_msg1_v.16b
+ add l0_tmp0_v.4s,l0_msg0_v.4s,key_v.4s
+ rev32 l0_msg2_v.16b,l0_msg2_v.16b
+ rev32 l0_msg3_v.16b,l0_msg3_v.16b
+
+
+
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 0-3 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 16-19 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 32-35 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+
+ sha256_4_rounds_high msg1,tmp0,tmp1 /* rounds 48-51 */
+ sha256_4_rounds_high msg2,tmp1,tmp0
+ sha256_4_rounds_high msg3,tmp0,tmp1
+
+ /* rounds 60-63 */
+ mov l0_tmp2_v.16b,l0_abcd_v.16b
+ sha256h l0_abcd_q,l0_efgh_q,l0_tmp1_v.4s
+ sha256h2 l0_efgh_q,l0_tmp2_q,l0_tmp1_v.4s
+
+
+
+ add l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s
+ add l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s
+
+
+ bgt start_loop
+ str l0_abcd_q, [l0_job, 64]
+ str l0_efgh_q, [l0_job, 80]
+
+ ret
+
+ .size sha256_mb_ce_x1, .-sha256_mb_ce_x1
+ .section .rol0_data.cst16,"aM",@progbits,16
+ .align 4
+KEY:
+ .word 0x428A2F98
+ .word 0x71374491
+ .word 0xB5C0FBCF
+ .word 0xE9B5DBA5
+ .word 0x3956C25B
+ .word 0x59F111F1
+ .word 0x923F82A4
+ .word 0xAB1C5ED5
+ .word 0xD807AA98
+ .word 0x12835B01
+ .word 0x243185BE
+ .word 0x550C7DC3
+ .word 0x72BE5D74
+ .word 0x80DEB1FE
+ .word 0x9BDC06A7
+ .word 0xC19BF174
+ .word 0xE49B69C1
+ .word 0xEFBE4786
+ .word 0x0FC19DC6
+ .word 0x240CA1CC
+ .word 0x2DE92C6F
+ .word 0x4A7484AA
+ .word 0x5CB0A9DC
+ .word 0x76F988DA
+ .word 0x983E5152
+ .word 0xA831C66D
+ .word 0xB00327C8
+ .word 0xBF597FC7
+ .word 0xC6E00BF3
+ .word 0xD5A79147
+ .word 0x06CA6351
+ .word 0x14292967
+ .word 0x27B70A85
+ .word 0x2E1B2138
+ .word 0x4D2C6DFC
+ .word 0x53380D13
+ .word 0x650A7354
+ .word 0x766A0ABB
+ .word 0x81C2C92E
+ .word 0x92722C85
+ .word 0xA2BFE8A1
+ .word 0xA81A664B
+ .word 0xC24B8B70
+ .word 0xC76C51A3
+ .word 0xD192E819
+ .word 0xD6990624
+ .word 0xF40E3585
+ .word 0x106AA070
+ .word 0x19A4C116
+ .word 0x1E376C08
+ .word 0x2748774C
+ .word 0x34B0BCB5
+ .word 0x391C0CB3
+ .word 0x4ED8AA4A
+ .word 0x5B9CCA4F
+ .word 0x682E6FF3
+ .word 0x748F82EE
+ .word 0x78A5636F
+ .word 0x84C87814
+ .word 0x8CC70208
+ .word 0x90BEFFFA
+ .word 0xA4506CEB
+ .word 0xBEF9A3F7
+ .word 0xC67178F2
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x2_ce.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x2_ce.S
new file mode 100644
index 000000000..dadf44bb0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x2_ce.S
@@ -0,0 +1,289 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+/**
+macros for rounds 48-63
+*/
+.macro sha256_4_rounds_high msg:req,tmp0:req,tmp1:req
+ ldr key_q , [tmp]
+ mov l0_tmp2_v.16b,l0_abcd_v.16b
+ mov l1_tmp2_v.16b,l1_abcd_v.16b
+ add tmp,tmp,16
+ add l0_\tmp1\()_v.4s,l0_\msg\()_v.4s,key_v.4s
+ add l1_\tmp1\()_v.4s,l1_\msg\()_v.4s,key_v.4s
+ sha256h l0_abcd_q,l0_efgh_q,l0_\tmp0\()_v.4s
+ sha256h l1_abcd_q,l1_efgh_q,l1_\tmp0\()_v.4s
+ sha256h2 l0_efgh_q,l0_tmp2_q,l0_\tmp0\()_v.4s
+ sha256h2 l1_efgh_q,l1_tmp2_q,l1_\tmp0\()_v.4s
+
+.endm
+/**
+macros for rounds 0-47
+*/
+.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req,tmp1:req
+ sha256su0 l0_\msg0\()_v.4s,l0_\msg1\()_v.4s
+ sha256su0 l1_\msg0\()_v.4s,l1_\msg1\()_v.4s
+ sha256_4_rounds_high \msg1,\tmp0,\tmp1
+ sha256su1 l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
+ sha256su1 l1_\msg0\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s
+.endm
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg key,31
+
+
+/*
+digest variables
+*/
+ declare_var_vector_reg l0_abcd,0
+ declare_var_vector_reg l0_efgh,1
+ declare_var_vector_reg l0_abcd_saved,2
+ declare_var_vector_reg l0_efgh_saved,3
+ declare_var_vector_reg l1_abcd,4
+ declare_var_vector_reg l1_efgh,5
+ declare_var_vector_reg l1_abcd_saved,6
+ declare_var_vector_reg l1_efgh_saved,7
+/*
+Temporary variables
+*/
+ declare_var_vector_reg l0_tmp0,8
+ declare_var_vector_reg l0_tmp1,9
+ declare_var_vector_reg l0_tmp2,10
+ declare_var_vector_reg l1_tmp0,11
+ declare_var_vector_reg l1_tmp1,12
+ declare_var_vector_reg l1_tmp2,13
+/*
+Message variables
+*/
+ declare_var_vector_reg l0_msg0,16
+ declare_var_vector_reg l0_msg1,17
+ declare_var_vector_reg l0_msg2,18
+ declare_var_vector_reg l0_msg3,19
+ declare_var_vector_reg l1_msg0,20
+ declare_var_vector_reg l1_msg1,21
+ declare_var_vector_reg l1_msg2,22
+ declare_var_vector_reg l1_msg3,23
+
+
+
+/*
+ void sha256_mb_ce_x2(SHA256_JOB *, SHA256_JOB *, int);
+*/
+/*
+Arguments list
+*/
+ l0_job .req x0
+ l1_job .req x1
+ len .req w2
+ l0_data .req x3
+ l1_data .req x4
+ tmp .req x5
+ .global sha256_mb_ce_x2
+ .type sha256_mb_ce_x2, %function
+sha256_mb_ce_x2:
+ //push d8~d15
+ stp d8,d9,[sp,-192]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+ ldr l0_data, [l0_job]
+ ldr l0_abcd_q, [l0_job, 64]
+ ldr l0_efgh_q, [l0_job, 80]
+ ldr l1_data, [l1_job]
+ ldr l1_abcd_q, [l1_job, 64]
+ ldr l1_efgh_q, [l1_job, 80]
+
+
+
+start_loop:
+
+ //load key addr
+ adr tmp, KEY
+ //load msgs
+ ld1 {l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
+ ld1 {l1_msg0_v.4s-l1_msg3_v.4s},[l1_data]
+ ldr key_q,[tmp]
+ add tmp,tmp,16
+ //adjust loop parameter
+ add l0_data,l0_data,64
+ add l1_data,l1_data,64
+ sub len, len, #1
+ cmp len, 0
+ //backup digest
+ mov l0_abcd_saved_v.16b,l0_abcd_v.16b
+ mov l0_efgh_saved_v.16b,l0_efgh_v.16b
+ mov l1_abcd_saved_v.16b,l1_abcd_v.16b
+ mov l1_efgh_saved_v.16b,l1_efgh_v.16b
+
+ rev32 l0_msg0_v.16b,l0_msg0_v.16b
+ rev32 l0_msg1_v.16b,l0_msg1_v.16b
+ add l0_tmp0_v.4s, l0_msg0_v.4s,key_v.4s
+ rev32 l0_msg2_v.16b,l0_msg2_v.16b
+ rev32 l0_msg3_v.16b,l0_msg3_v.16b
+
+ rev32 l1_msg0_v.16b,l1_msg0_v.16b
+ rev32 l1_msg1_v.16b,l1_msg1_v.16b
+ add l1_tmp0_v.4s, l1_msg0_v.4s,key_v.4s
+ rev32 l1_msg2_v.16b,l1_msg2_v.16b
+ rev32 l1_msg3_v.16b,l1_msg3_v.16b
+
+
+
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 0-3 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 16-19 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 32-35 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+
+ sha256_4_rounds_high msg1,tmp0,tmp1 /* rounds 48-51 */
+ sha256_4_rounds_high msg2,tmp1,tmp0
+ sha256_4_rounds_high msg3,tmp0,tmp1
+
+ /* rounds 60-63 */
+ mov l0_tmp2_v.16b,l0_abcd_v.16b
+ sha256h l0_abcd_q,l0_efgh_q,l0_tmp1_v.4s
+ sha256h2 l0_efgh_q,l0_tmp2_q,l0_tmp1_v.4s
+
+ mov l1_tmp2_v.16b,l1_abcd_v.16b
+ sha256h l1_abcd_q,l1_efgh_q,l1_tmp1_v.4s
+ sha256h2 l1_efgh_q,l1_tmp2_q,l1_tmp1_v.4s
+
+
+
+ add l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s
+ add l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s
+ add l1_abcd_v.4s,l1_abcd_v.4s,l1_abcd_saved_v.4s
+ add l1_efgh_v.4s,l1_efgh_v.4s,l1_efgh_saved_v.4s
+
+
+ bgt start_loop
+ str l0_abcd_q, [l0_job, 64]
+ str l0_efgh_q, [l0_job, 80]
+ str l1_abcd_q, [l1_job, 64]
+ str l1_efgh_q, [l1_job, 80]
+
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], 192
+ ret
+
+ .size sha256_mb_ce_x2, .-sha256_mb_ce_x2
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 4
+KEY:
+ .word 0x428A2F98
+ .word 0x71374491
+ .word 0xB5C0FBCF
+ .word 0xE9B5DBA5
+ .word 0x3956C25B
+ .word 0x59F111F1
+ .word 0x923F82A4
+ .word 0xAB1C5ED5
+ .word 0xD807AA98
+ .word 0x12835B01
+ .word 0x243185BE
+ .word 0x550C7DC3
+ .word 0x72BE5D74
+ .word 0x80DEB1FE
+ .word 0x9BDC06A7
+ .word 0xC19BF174
+ .word 0xE49B69C1
+ .word 0xEFBE4786
+ .word 0x0FC19DC6
+ .word 0x240CA1CC
+ .word 0x2DE92C6F
+ .word 0x4A7484AA
+ .word 0x5CB0A9DC
+ .word 0x76F988DA
+ .word 0x983E5152
+ .word 0xA831C66D
+ .word 0xB00327C8
+ .word 0xBF597FC7
+ .word 0xC6E00BF3
+ .word 0xD5A79147
+ .word 0x06CA6351
+ .word 0x14292967
+ .word 0x27B70A85
+ .word 0x2E1B2138
+ .word 0x4D2C6DFC
+ .word 0x53380D13
+ .word 0x650A7354
+ .word 0x766A0ABB
+ .word 0x81C2C92E
+ .word 0x92722C85
+ .word 0xA2BFE8A1
+ .word 0xA81A664B
+ .word 0xC24B8B70
+ .word 0xC76C51A3
+ .word 0xD192E819
+ .word 0xD6990624
+ .word 0xF40E3585
+ .word 0x106AA070
+ .word 0x19A4C116
+ .word 0x1E376C08
+ .word 0x2748774C
+ .word 0x34B0BCB5
+ .word 0x391C0CB3
+ .word 0x4ED8AA4A
+ .word 0x5B9CCA4F
+ .word 0x682E6FF3
+ .word 0x748F82EE
+ .word 0x78A5636F
+ .word 0x84C87814
+ .word 0x8CC70208
+ .word 0x90BEFFFA
+ .word 0xA4506CEB
+ .word 0xBEF9A3F7
+ .word 0xC67178F2
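
For context on how a caller is expected to drive this kernel: sha256_mb_ce_x2
advances two jobs in lockstep, hashing len 64-byte blocks per lane, and it does
no padding or finalization of its own. The C sketch below is illustrative only;
the sha256_job_sketch struct is reconstructed from the offsets the assembly
reads (data pointer at offset 0, digest words at offsets 64 and 80) and is not
the real SHA256_JOB from sha256_mb.h.

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical layout matching the offsets used above: [job] = data
     * pointer, [job + 64] = abcd, [job + 80] = efgh. */
    struct sha256_job_sketch {
            const uint8_t *buffer;     /* offset 0: message blocks */
            uint8_t        pad[56];    /* filler so the digest lands at 64 */
            uint32_t       digest[8];  /* offset 64: working state H0..H7 */
    };

    /* The assembly routine above: hashes 'len' full 64-byte blocks per job. */
    extern void sha256_mb_ce_x2(struct sha256_job_sketch *job0,
                                struct sha256_job_sketch *job1, int len);

    static void hash_two_buffers(const uint8_t *m0, const uint8_t *m1,
                                 int nblocks, uint32_t d0[8], uint32_t d1[8])
    {
            /* SHA-256 initial state (FIPS 180-4), kept as native 32-bit words;
             * the kernel byte-swaps the message, not the state. */
            static const uint32_t H0[8] = {
                    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
                    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
            };
            struct sha256_job_sketch j0 = { .buffer = m0 };
            struct sha256_job_sketch j1 = { .buffer = m1 };

            memcpy(j0.digest, H0, sizeof(H0));
            memcpy(j1.digest, H0, sizeof(H0));
            sha256_mb_ce_x2(&j0, &j1, nblocks); /* both lanes advance together */
            memcpy(d0, j0.digest, sizeof(H0));  /* padding is the caller's job */
            memcpy(d1, j1.digest, sizeof(H0));
    }
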
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x3_ce.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x3_ce.S
new file mode 100644
index 000000000..6ed1591ba
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x3_ce.S
@@ -0,0 +1,342 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+/**
+macro for rounds 48-63
+*/
+.macro sha256_4_rounds_high msg:req,tmp0:req,tmp1:req
+ ldr key_q , [tmp]
+ mov l0_tmp2_v.16b,l0_abcd_v.16b
+ mov l1_tmp2_v.16b,l1_abcd_v.16b
+ mov l2_tmp2_v.16b,l2_abcd_v.16b
+ add tmp,tmp,16
+ add l0_\tmp1\()_v.4s,l0_\msg\()_v.4s,key_v.4s
+ add l1_\tmp1\()_v.4s,l1_\msg\()_v.4s,key_v.4s
+ add l2_\tmp1\()_v.4s,l2_\msg\()_v.4s,key_v.4s
+ sha256h l0_abcd_q,l0_efgh_q,l0_\tmp0\()_v.4s
+ sha256h l1_abcd_q,l1_efgh_q,l1_\tmp0\()_v.4s
+ sha256h l2_abcd_q,l2_efgh_q,l2_\tmp0\()_v.4s
+ sha256h2 l0_efgh_q,l0_tmp2_q,l0_\tmp0\()_v.4s
+ sha256h2 l1_efgh_q,l1_tmp2_q,l1_\tmp0\()_v.4s
+ sha256h2 l2_efgh_q,l2_tmp2_q,l2_\tmp0\()_v.4s
+
+.endm
+/**
+macro for rounds 0-47
+*/
+.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req,tmp1:req
+ sha256su0 l0_\msg0\()_v.4s,l0_\msg1\()_v.4s
+ sha256su0 l1_\msg0\()_v.4s,l1_\msg1\()_v.4s
+ sha256su0 l2_\msg0\()_v.4s,l2_\msg1\()_v.4s
+ sha256_4_rounds_high \msg1,\tmp0,\tmp1
+ sha256su1 l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
+ sha256su1 l1_\msg0\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s
+ sha256su1 l2_\msg0\()_v.4s,l2_\msg2\()_v.4s,l2_\msg3\()_v.4s
+.endm
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg key,31
+
+
+/*
+digest variables
+*/
+ declare_var_vector_reg l0_abcd,0
+ declare_var_vector_reg l0_efgh,1
+ declare_var_vector_reg l1_abcd,2
+ declare_var_vector_reg l1_efgh,3
+ declare_var_vector_reg l2_abcd,4
+ declare_var_vector_reg l2_efgh,5
+ declare_var_vector_reg l1_abcd_saved,16
+ declare_var_vector_reg l1_efgh_saved,17
+ declare_var_vector_reg l0_abcd_saved,20
+ declare_var_vector_reg l0_efgh_saved,21
+ declare_var_vector_reg l2_abcd_saved,24
+ declare_var_vector_reg l2_efgh_saved,25
+/*
+Temporary variables
+*/
+ declare_var_vector_reg l0_tmp0,6
+ declare_var_vector_reg l0_tmp1,7
+ declare_var_vector_reg l0_tmp2,8
+ declare_var_vector_reg l1_tmp0,9
+ declare_var_vector_reg l1_tmp1,10
+ declare_var_vector_reg l1_tmp2,11
+ declare_var_vector_reg l2_tmp0,12
+ declare_var_vector_reg l2_tmp1,13
+ declare_var_vector_reg l2_tmp2,14
+/*
+Message variables
+*/
+ declare_var_vector_reg l0_msg0,16
+ declare_var_vector_reg l0_msg1,17
+ declare_var_vector_reg l0_msg2,18
+ declare_var_vector_reg l0_msg3,19
+ declare_var_vector_reg l1_msg0,20
+ declare_var_vector_reg l1_msg1,21
+ declare_var_vector_reg l1_msg2,22
+ declare_var_vector_reg l1_msg3,23
+ declare_var_vector_reg l2_msg0,24
+ declare_var_vector_reg l2_msg1,25
+ declare_var_vector_reg l2_msg2,26
+ declare_var_vector_reg l2_msg3,27
+
+
+
+/*
+ void sha256_mb_ce_x3(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int);
+*/
+/*
+Arguments list
+*/
+ l0_job .req x0
+ l1_job .req x1
+ l2_job .req x2
+ len .req w3
+ l0_data .req x4
+ l1_data .req x5
+ l2_data .req x6
+ tmp .req x7
+ .global sha256_mb_ce_x3
+ .type sha256_mb_ce_x3, %function
+sha256_mb_ce_x3:
+ //push d8~d15
+ stp d8,d9,[sp,-192]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+ ldr l0_data, [l0_job]
+ ldr l0_abcd_q, [l0_job, 64]
+ ldr l0_efgh_q, [l0_job, 80]
+ ldr l1_data, [l1_job]
+ ldr l1_abcd_q, [l1_job, 64]
+ ldr l1_efgh_q, [l1_job, 80]
+ ldr l2_data, [l2_job]
+ ldr l2_abcd_q, [l2_job, 64]
+ ldr l2_efgh_q, [l2_job, 80]
+
+
+
+start_loop:
+
+ //load key addr
+ adr tmp, KEY
+ //load msgs
+ ld1 {l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
+ ld1 {l1_msg0_v.4s-l1_msg3_v.4s},[l1_data]
+ ld1 {l2_msg0_v.4s-l2_msg3_v.4s},[l2_data]
+ ldr key_q,[tmp]
+ add tmp,tmp,16
+ //adjust loop parameter
+ add l0_data,l0_data,64
+ add l1_data,l1_data,64
+ add l2_data,l2_data,64
+ sub len, len, #1
+ cmp len, 0
+/*
+ //backup digest
+ mov l0_abcd_saved_v.16b,l0_abcd_v.16b
+ mov l0_efgh_saved_v.16b,l0_efgh_v.16b
+ mov l1_abcd_saved_v.16b,l1_abcd_v.16b
+ mov l1_efgh_saved_v.16b,l1_efgh_v.16b
+ mov l2_abcd_saved_v.16b,l2_abcd_v.16b
+ mov l2_efgh_saved_v.16b,l2_efgh_v.16b
+*/
+
+ rev32 l0_msg0_v.16b,l0_msg0_v.16b
+ rev32 l0_msg1_v.16b,l0_msg1_v.16b
+ add l0_tmp0_v.4s, l0_msg0_v.4s,key_v.4s
+ rev32 l0_msg2_v.16b,l0_msg2_v.16b
+ rev32 l0_msg3_v.16b,l0_msg3_v.16b
+
+ rev32 l1_msg0_v.16b,l1_msg0_v.16b
+ rev32 l1_msg1_v.16b,l1_msg1_v.16b
+ add l1_tmp0_v.4s, l1_msg0_v.4s,key_v.4s
+ rev32 l1_msg2_v.16b,l1_msg2_v.16b
+ rev32 l1_msg3_v.16b,l1_msg3_v.16b
+
+ rev32 l2_msg0_v.16b,l2_msg0_v.16b
+ rev32 l2_msg1_v.16b,l2_msg1_v.16b
+ add l2_tmp0_v.4s, l2_msg0_v.4s,key_v.4s
+ rev32 l2_msg2_v.16b,l2_msg2_v.16b
+ rev32 l2_msg3_v.16b,l2_msg3_v.16b
+
+
+
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 0-3 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 16-19 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 32-35 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+
+
+
+ sha256_4_rounds_high msg1,tmp0,tmp1 /* rounds 48-51 */
+
+ /* msg0/msg1 are no longer needed, so their registers hold the saved digest */
+ ldr l0_abcd_saved_q, [l0_job, 64]
+ ldr l1_abcd_saved_q, [l1_job, 64]
+ ldr l2_abcd_saved_q, [l2_job, 64]
+ ldr l0_efgh_saved_q, [l0_job, 80]
+ ldr l1_efgh_saved_q, [l1_job, 80]
+ ldr l2_efgh_saved_q, [l2_job, 80]
+
+ sha256_4_rounds_high msg2,tmp1,tmp0
+ sha256_4_rounds_high msg3,tmp0,tmp1
+
+ /* rounds 60-63 */
+ mov l0_tmp2_v.16b,l0_abcd_v.16b
+ sha256h l0_abcd_q,l0_efgh_q,l0_tmp1_v.4s
+ sha256h2 l0_efgh_q,l0_tmp2_q,l0_tmp1_v.4s
+
+ mov l1_tmp2_v.16b,l1_abcd_v.16b
+ sha256h l1_abcd_q,l1_efgh_q,l1_tmp1_v.4s
+ sha256h2 l1_efgh_q,l1_tmp2_q,l1_tmp1_v.4s
+
+ mov l2_tmp2_v.16b,l2_abcd_v.16b
+ sha256h l2_abcd_q,l2_efgh_q,l2_tmp1_v.4s
+ sha256h2 l2_efgh_q,l2_tmp2_q,l2_tmp1_v.4s
+
+ /* combine state */
+ add l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s
+ add l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s
+ add l1_abcd_v.4s,l1_abcd_v.4s,l1_abcd_saved_v.4s
+ add l1_efgh_v.4s,l1_efgh_v.4s,l1_efgh_saved_v.4s
+ add l2_abcd_v.4s,l2_abcd_v.4s,l2_abcd_saved_v.4s
+ add l2_efgh_v.4s,l2_efgh_v.4s,l2_efgh_saved_v.4s
+
+ str l0_abcd_q, [l0_job, 64]
+ str l0_efgh_q, [l0_job, 80]
+ str l1_abcd_q, [l1_job, 64]
+ str l1_efgh_q, [l1_job, 80]
+ str l2_abcd_q, [l2_job, 64]
+ str l2_efgh_q, [l2_job, 80]
+
+ bgt start_loop
+
+
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], 192
+ ret
+
+ .size sha256_mb_ce_x3, .-sha256_mb_ce_x3
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 4
+KEY:
+ .word 0x428A2F98
+ .word 0x71374491
+ .word 0xB5C0FBCF
+ .word 0xE9B5DBA5
+ .word 0x3956C25B
+ .word 0x59F111F1
+ .word 0x923F82A4
+ .word 0xAB1C5ED5
+ .word 0xD807AA98
+ .word 0x12835B01
+ .word 0x243185BE
+ .word 0x550C7DC3
+ .word 0x72BE5D74
+ .word 0x80DEB1FE
+ .word 0x9BDC06A7
+ .word 0xC19BF174
+ .word 0xE49B69C1
+ .word 0xEFBE4786
+ .word 0x0FC19DC6
+ .word 0x240CA1CC
+ .word 0x2DE92C6F
+ .word 0x4A7484AA
+ .word 0x5CB0A9DC
+ .word 0x76F988DA
+ .word 0x983E5152
+ .word 0xA831C66D
+ .word 0xB00327C8
+ .word 0xBF597FC7
+ .word 0xC6E00BF3
+ .word 0xD5A79147
+ .word 0x06CA6351
+ .word 0x14292967
+ .word 0x27B70A85
+ .word 0x2E1B2138
+ .word 0x4D2C6DFC
+ .word 0x53380D13
+ .word 0x650A7354
+ .word 0x766A0ABB
+ .word 0x81C2C92E
+ .word 0x92722C85
+ .word 0xA2BFE8A1
+ .word 0xA81A664B
+ .word 0xC24B8B70
+ .word 0xC76C51A3
+ .word 0xD192E819
+ .word 0xD6990624
+ .word 0xF40E3585
+ .word 0x106AA070
+ .word 0x19A4C116
+ .word 0x1E376C08
+ .word 0x2748774C
+ .word 0x34B0BCB5
+ .word 0x391C0CB3
+ .word 0x4ED8AA4A
+ .word 0x5B9CCA4F
+ .word 0x682E6FF3
+ .word 0x748F82EE
+ .word 0x78A5636F
+ .word 0x84C87814
+ .word 0x8CC70208
+ .word 0x90BEFFFA
+ .word 0xA4506CEB
+ .word 0xBEF9A3F7
+ .word 0xC67178F2
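
The KEY table carried at the end of each of these kernels is the standard
SHA-256 round-constant schedule: entry t is the first 32 bits of the fractional
part of the cube root of the t-th prime. A small standalone C check (not part
of the patch) that reproduces the first row of the table:

    #include <inttypes.h>
    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Recompute K[t] = floor(frac(cbrt(prime_t)) * 2^32) for the first eight
     * primes and compare against the start of the KEY table above.
     * Build with: cc -O2 k_check.c -lm */
    int main(void)
    {
            const unsigned primes[8] = { 2, 3, 5, 7, 11, 13, 17, 19 };
            const uint32_t expect[8] = {
                    0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
                    0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5
            };

            for (unsigned t = 0; t < 8; t++) {
                    double frac = cbrt((double)primes[t]);
                    frac -= floor(frac);
                    uint32_t k = (uint32_t)(frac * 4294967296.0); /* 2^32 */
                    printf("K[%u] = 0x%08" PRIX32 " %s\n", t, k,
                           k == expect[t] ? "(matches)" : "(mismatch)");
            }
            return 0;
    }
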
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x4_ce.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x4_ce.S
new file mode 100644
index 000000000..b1686ada1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x4_ce.S
@@ -0,0 +1,380 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+/**
+macro for rounds 48-63
+tmp0 : in
+tmp1 : out
+*/
+.macro sha256_4_rounds_high msg:req,tmp0:req
+ ldr key_q , [tmp]
+ mov tmp0_v.16b,l0_\tmp0\()_v.16b
+ mov tmp1_v.16b,l1_\tmp0\()_v.16b
+ add l0_\tmp0\()_v.4s,l0_\msg\()_v.4s,key_v.4s
+ add l1_\tmp0\()_v.4s,l1_\msg\()_v.4s,key_v.4s
+ mov tmp2_v.16b,l0_abcd_v.16b
+ mov tmp3_v.16b,l1_abcd_v.16b
+ sha256h l0_abcd_q,l0_efgh_q,tmp0_v.4s
+ sha256h l1_abcd_q,l1_efgh_q,tmp1_v.4s
+ sha256h2 l0_efgh_q,tmp2_q,tmp0_v.4s
+ sha256h2 l1_efgh_q,tmp3_q,tmp1_v.4s
+
+ ldr key_q , [tmp]
+ mov tmp0_v.16b,l2_\tmp0\()_v.16b
+ mov tmp1_v.16b,l3_\tmp0\()_v.16b
+ add tmp,tmp,16
+ add l2_\tmp0\()_v.4s,l2_\msg\()_v.4s,key_v.4s
+ add l3_\tmp0\()_v.4s,l3_\msg\()_v.4s,key_v.4s
+ mov tmp2_v.16b,l2_abcd_v.16b
+ mov tmp3_v.16b,l3_abcd_v.16b
+ sha256h l2_abcd_q,l2_efgh_q,tmp0_v.4s
+ sha256h l3_abcd_q,l3_efgh_q,tmp1_v.4s
+ sha256h2 l2_efgh_q,tmp2_q,tmp0_v.4s
+ sha256h2 l3_efgh_q,tmp3_q,tmp1_v.4s
+
+
+.endm
+/**
+macro for rounds 0-47
+*/
+.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req
+ sha256su0 l0_\msg0\()_v.4s,l0_\msg1\()_v.4s
+ sha256su0 l1_\msg0\()_v.4s,l1_\msg1\()_v.4s
+ sha256su0 l2_\msg0\()_v.4s,l2_\msg1\()_v.4s
+ sha256su0 l3_\msg0\()_v.4s,l3_\msg1\()_v.4s
+ sha256_4_rounds_high \msg1,\tmp0
+ sha256su1 l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
+ sha256su1 l1_\msg0\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s
+ sha256su1 l2_\msg0\()_v.4s,l2_\msg2\()_v.4s,l2_\msg3\()_v.4s
+ sha256su1 l3_\msg0\()_v.4s,l3_\msg2\()_v.4s,l3_\msg3\()_v.4s
+.endm
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg key,15
+
+
+/*
+digest variables
+*/
+ declare_var_vector_reg l0_abcd,0
+ declare_var_vector_reg l0_efgh,1
+ declare_var_vector_reg l1_abcd,2
+ declare_var_vector_reg l1_efgh,3
+ declare_var_vector_reg l2_abcd,4
+ declare_var_vector_reg l2_efgh,5
+ declare_var_vector_reg l3_abcd,6
+ declare_var_vector_reg l3_efgh,7
+ declare_var_vector_reg l1_abcd_saved,16
+ declare_var_vector_reg l1_efgh_saved,17
+ declare_var_vector_reg l0_abcd_saved,20
+ declare_var_vector_reg l0_efgh_saved,21
+ declare_var_vector_reg l2_abcd_saved,24
+ declare_var_vector_reg l2_efgh_saved,25
+ declare_var_vector_reg l3_abcd_saved,28
+ declare_var_vector_reg l3_efgh_saved,29
+/*
+Temporary variables
+*/
+ declare_var_vector_reg l0_tmp0,8
+ declare_var_vector_reg l1_tmp0,9
+ declare_var_vector_reg l2_tmp0,10
+ declare_var_vector_reg l3_tmp0,11
+
+ declare_var_vector_reg tmp0,12
+ declare_var_vector_reg tmp1,13
+ declare_var_vector_reg tmp2,14
+ declare_var_vector_reg tmp3,15
+
+/*
+Message variables
+*/
+ declare_var_vector_reg l0_msg0,16
+ declare_var_vector_reg l0_msg1,17
+ declare_var_vector_reg l0_msg2,18
+ declare_var_vector_reg l0_msg3,19
+ declare_var_vector_reg l1_msg0,20
+ declare_var_vector_reg l1_msg1,21
+ declare_var_vector_reg l1_msg2,22
+ declare_var_vector_reg l1_msg3,23
+ declare_var_vector_reg l2_msg0,24
+ declare_var_vector_reg l2_msg1,25
+ declare_var_vector_reg l2_msg2,26
+ declare_var_vector_reg l2_msg3,27
+ declare_var_vector_reg l3_msg0,28
+ declare_var_vector_reg l3_msg1,29
+ declare_var_vector_reg l3_msg2,30
+ declare_var_vector_reg l3_msg3,31
+
+
+
+/*
+ void sha256_mb_ce_x4(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int);
+*/
+/*
+Arguments list
+*/
+ l0_job .req x0
+ l1_job .req x1
+ l2_job .req x2
+ l3_job .req x3
+ len .req w4
+ l0_data .req x5
+ l1_data .req x6
+ l2_data .req x7
+ l3_data .req x8
+ tmp .req x9
+ .global sha256_mb_ce_x4
+ .type sha256_mb_ce_x4, %function
+sha256_mb_ce_x4:
+ //push d8~d15
+ stp d8,d9,[sp,-192]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+ ldr l0_data, [l0_job]
+ ldr l0_abcd_q, [l0_job, 64]
+ ldr l0_efgh_q, [l0_job, 80]
+ ldr l1_data, [l1_job]
+ ldr l1_abcd_q, [l1_job, 64]
+ ldr l1_efgh_q, [l1_job, 80]
+ ldr l2_data, [l2_job]
+ ldr l2_abcd_q, [l2_job, 64]
+ ldr l2_efgh_q, [l2_job, 80]
+ ldr l3_data, [l3_job]
+ ldr l3_abcd_q, [l3_job, 64]
+ ldr l3_efgh_q, [l3_job, 80]
+
+
+
+start_loop:
+
+ //load key addr
+ adr tmp, KEY
+ //load msgs
+ ld1 {l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
+ ld1 {l1_msg0_v.4s-l1_msg3_v.4s},[l1_data]
+ ld1 {l2_msg0_v.4s-l2_msg3_v.4s},[l2_data]
+ ld1 {l3_msg0_v.4s-l3_msg3_v.4s},[l3_data]
+ ldr key_q,[tmp]
+ add tmp,tmp,16
+ //adjust loop parameter
+ add l0_data,l0_data,64
+ add l1_data,l1_data,64
+ add l2_data,l2_data,64
+ add l3_data,l3_data,64
+ sub len, len, #1
+ cmp len, 0
+
+
+ rev32 l0_msg0_v.16b,l0_msg0_v.16b
+ rev32 l0_msg1_v.16b,l0_msg1_v.16b
+ add l0_tmp0_v.4s, l0_msg0_v.4s,key_v.4s
+ rev32 l0_msg2_v.16b,l0_msg2_v.16b
+ rev32 l0_msg3_v.16b,l0_msg3_v.16b
+
+ rev32 l1_msg0_v.16b,l1_msg0_v.16b
+ rev32 l1_msg1_v.16b,l1_msg1_v.16b
+ add l1_tmp0_v.4s, l1_msg0_v.4s,key_v.4s
+ rev32 l1_msg2_v.16b,l1_msg2_v.16b
+ rev32 l1_msg3_v.16b,l1_msg3_v.16b
+
+ rev32 l2_msg0_v.16b,l2_msg0_v.16b
+ rev32 l2_msg1_v.16b,l2_msg1_v.16b
+ add l2_tmp0_v.4s, l2_msg0_v.4s,key_v.4s
+ rev32 l2_msg2_v.16b,l2_msg2_v.16b
+ rev32 l2_msg3_v.16b,l2_msg3_v.16b
+
+ rev32 l3_msg0_v.16b,l3_msg0_v.16b
+ rev32 l3_msg1_v.16b,l3_msg1_v.16b
+ add l3_tmp0_v.4s, l3_msg0_v.4s,key_v.4s
+ rev32 l3_msg2_v.16b,l3_msg2_v.16b
+ rev32 l3_msg3_v.16b,l3_msg3_v.16b
+
+
+
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0 /* rounds 0-3 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp0
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0 /* rounds 16-19 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp0
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0 /* rounds 32-35 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp0
+
+
+
+ sha256_4_rounds_high msg1,tmp0 /* rounds 48-51 */
+
+ /* msg0/msg1 are no longer needed, so their registers hold the saved digest */
+ ldr l0_abcd_saved_q, [l0_job, 64]
+ ldr l1_abcd_saved_q, [l1_job, 64]
+ ldr l2_abcd_saved_q, [l2_job, 64]
+ ldr l3_abcd_saved_q, [l3_job, 64]
+ ldr l0_efgh_saved_q, [l0_job, 80]
+ ldr l1_efgh_saved_q, [l1_job, 80]
+ ldr l2_efgh_saved_q, [l2_job, 80]
+ ldr l3_efgh_saved_q, [l3_job, 80]
+
+ sha256_4_rounds_high msg2,tmp0
+ sha256_4_rounds_high msg3,tmp0
+
+ /* rounds 60-63 */
+ mov tmp2_v.16b,l0_abcd_v.16b
+ sha256h l0_abcd_q,l0_efgh_q,l0_tmp0_v.4s
+ sha256h2 l0_efgh_q,tmp2_q,l0_tmp0_v.4s
+
+ mov tmp2_v.16b,l1_abcd_v.16b
+ sha256h l1_abcd_q,l1_efgh_q,l1_tmp0_v.4s
+ sha256h2 l1_efgh_q,tmp2_q,l1_tmp0_v.4s
+
+ mov tmp2_v.16b,l2_abcd_v.16b
+ sha256h l2_abcd_q,l2_efgh_q,l2_tmp0_v.4s
+ sha256h2 l2_efgh_q,tmp2_q,l2_tmp0_v.4s
+
+ mov tmp2_v.16b,l3_abcd_v.16b
+ sha256h l3_abcd_q,l3_efgh_q,l3_tmp0_v.4s
+ sha256h2 l3_efgh_q,tmp2_q,l3_tmp0_v.4s
+
+ /* combine state */
+ add l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s
+ add l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s
+ add l1_abcd_v.4s,l1_abcd_v.4s,l1_abcd_saved_v.4s
+ add l1_efgh_v.4s,l1_efgh_v.4s,l1_efgh_saved_v.4s
+ add l2_abcd_v.4s,l2_abcd_v.4s,l2_abcd_saved_v.4s
+ add l2_efgh_v.4s,l2_efgh_v.4s,l2_efgh_saved_v.4s
+ add l3_abcd_v.4s,l3_abcd_v.4s,l3_abcd_saved_v.4s
+ add l3_efgh_v.4s,l3_efgh_v.4s,l3_efgh_saved_v.4s
+
+ str l0_abcd_q, [l0_job, 64]
+ str l0_efgh_q, [l0_job, 80]
+ str l1_abcd_q, [l1_job, 64]
+ str l1_efgh_q, [l1_job, 80]
+ str l2_abcd_q, [l2_job, 64]
+ str l2_efgh_q, [l2_job, 80]
+ str l3_abcd_q, [l3_job, 64]
+ str l3_efgh_q, [l3_job, 80]
+
+ bgt start_loop
+
+
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], 192
+ ret
+
+ .size sha256_mb_ce_x4, .-sha256_mb_ce_x4
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 4
+KEY:
+ .word 0x428A2F98
+ .word 0x71374491
+ .word 0xB5C0FBCF
+ .word 0xE9B5DBA5
+ .word 0x3956C25B
+ .word 0x59F111F1
+ .word 0x923F82A4
+ .word 0xAB1C5ED5
+ .word 0xD807AA98
+ .word 0x12835B01
+ .word 0x243185BE
+ .word 0x550C7DC3
+ .word 0x72BE5D74
+ .word 0x80DEB1FE
+ .word 0x9BDC06A7
+ .word 0xC19BF174
+ .word 0xE49B69C1
+ .word 0xEFBE4786
+ .word 0x0FC19DC6
+ .word 0x240CA1CC
+ .word 0x2DE92C6F
+ .word 0x4A7484AA
+ .word 0x5CB0A9DC
+ .word 0x76F988DA
+ .word 0x983E5152
+ .word 0xA831C66D
+ .word 0xB00327C8
+ .word 0xBF597FC7
+ .word 0xC6E00BF3
+ .word 0xD5A79147
+ .word 0x06CA6351
+ .word 0x14292967
+ .word 0x27B70A85
+ .word 0x2E1B2138
+ .word 0x4D2C6DFC
+ .word 0x53380D13
+ .word 0x650A7354
+ .word 0x766A0ABB
+ .word 0x81C2C92E
+ .word 0x92722C85
+ .word 0xA2BFE8A1
+ .word 0xA81A664B
+ .word 0xC24B8B70
+ .word 0xC76C51A3
+ .word 0xD192E819
+ .word 0xD6990624
+ .word 0xF40E3585
+ .word 0x106AA070
+ .word 0x19A4C116
+ .word 0x1E376C08
+ .word 0x2748774C
+ .word 0x34B0BCB5
+ .word 0x391C0CB3
+ .word 0x4ED8AA4A
+ .word 0x5B9CCA4F
+ .word 0x682E6FF3
+ .word 0x748F82EE
+ .word 0x78A5636F
+ .word 0x84C87814
+ .word 0x8CC70208
+ .word 0x90BEFFFA
+ .word 0xA4506CEB
+ .word 0xBEF9A3F7
+ .word 0xC67178F2
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c
new file mode 100644
index 000000000..12441a8e3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c
@@ -0,0 +1,268 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx")
+#endif
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_avx(SHA256_HASH_CTX_MGR * mgr)
+{
+ sha256_mb_mgr_init_avx(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_avx_slver_02020154;
+struct slver sha256_ctx_mgr_init_avx_slver = { 0x0154, 0x02, 0x02 };
+
+struct slver sha256_ctx_mgr_submit_avx_slver_02020155;
+struct slver sha256_ctx_mgr_submit_avx_slver = { 0x0155, 0x02, 0x02 };
+
+struct slver sha256_ctx_mgr_flush_avx_slver_02020156;
+struct slver sha256_ctx_mgr_flush_avx_slver = { 0x0156, 0x02, 0x02 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
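
Putting the pieces of sha256_ctx_avx.c together, a caller submits contexts to
the manager and then flushes until nothing is left in flight. The sketch below
is a rough usage example, not the official API contract: the 64-byte aligned
allocation, the zero-initialization standing in for the library's context init
helper, and the single-buffer HASH_ENTIRE flow are assumptions.

    #include <inttypes.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "sha256_mb.h"

    int main(void)
    {
            SHA256_HASH_CTX_MGR *mgr = NULL;
            SHA256_HASH_CTX ctx;
            const char msg[] = "abc";

            /* The manager holds SIMD lane state; give it generous alignment. */
            if (posix_memalign((void **)&mgr, 64, sizeof(*mgr)))
                    return 1;
            sha256_ctx_mgr_init_avx(mgr);

            memset(&ctx, 0, sizeof(ctx)); /* stands in for the ctx init helper */

            /* Submit a complete buffer in one call (FIRST and LAST together). */
            sha256_ctx_mgr_submit_avx(mgr, &ctx, msg, sizeof(msg) - 1, HASH_ENTIRE);

            /* Drain every lane; flush returns NULL once no jobs remain. */
            while (sha256_ctx_mgr_flush_avx(mgr) != NULL)
                    ;

            for (int i = 0; i < SHA256_DIGEST_NWORDS; i++)
                    printf("%08" PRIx32, ctx.job.result_digest[i]);
            putchar('\n');

            free(mgr);
            return 0;
    }
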
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c
new file mode 100644
index 000000000..9c045659e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c
@@ -0,0 +1,268 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_avx2(SHA256_HASH_CTX_MGR * mgr)
+{
+ sha256_mb_mgr_init_avx2(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx2(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx2(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx2(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_avx2_slver_04020157;
+struct slver sha256_ctx_mgr_init_avx2_slver = { 0x0157, 0x02, 0x04 };
+
+struct slver sha256_ctx_mgr_submit_avx2_slver_04020158;
+struct slver sha256_ctx_mgr_submit_avx2_slver = { 0x0158, 0x02, 0x04 };
+
+struct slver sha256_ctx_mgr_flush_avx2_slver_04020159;
+struct slver sha256_ctx_mgr_flush_avx2_slver = { 0x0159, 0x02, 0x04 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
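
hash_pad() above packs the whole padding-layout decision into one line of index
arithmetic. The standalone check below mirrors only the block count it returns,
assuming SHA256_BLOCK_SIZE is 64 and an 8-byte SHA256_PADLENGTHFIELD_SIZE (the
#if branch above also allows a 16-byte field); it is a sanity sketch, not
library code.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define BLOCK    64u
    #define LENFIELD 8u   /* assumed SHA256_PADLENGTHFIELD_SIZE */

    /* Mirror of the index arithmetic in hash_pad(): step past the leftover
     * bytes, the 0x80 marker, the zero padding and the length field, landing
     * at the end of the first or second extra block. */
    static uint32_t extra_blocks(uint64_t total_len)
    {
            uint32_t i = (uint32_t)(total_len & (BLOCK - 1));

            i += ((BLOCK - 1) & (0 - (total_len + LENFIELD + 1))) + 1 + LENFIELD;
            return i >> 6; /* SHA256_LOG2_BLOCK_SIZE */
    }

    int main(void)
    {
            /* 36 leftover bytes + 0x80 + length fit in one extra block ... */
            assert(extra_blocks(100) == 1);
            /* ... but 56 leftover bytes spill into a second one. */
            assert(extra_blocks(120) == 2);
            /* An exact multiple of 64 still needs one padding-only block. */
            assert(extra_blocks(128) == 1);
            printf("hash_pad block counts check out\n");
            return 0;
    }
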
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c
new file mode 100644
index 000000000..a1f068987
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c
@@ -0,0 +1,273 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_avx512(SHA256_HASH_CTX_MGR * mgr)
+{
+ sha256_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx512(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx512(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx512(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx =
+ (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_avx512_slver_0600015a;
+struct slver sha256_ctx_mgr_init_avx512_slver = { 0x015a, 0x00, 0x06 };
+
+struct slver sha256_ctx_mgr_submit_avx512_slver_0600015b;
+struct slver sha256_ctx_mgr_submit_avx512_slver = { 0x015b, 0x00, 0x06 };
+
+struct slver sha256_ctx_mgr_flush_avx512_slver_0600015c;
+struct slver sha256_ctx_mgr_flush_avx512_slver = { 0x015c, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512_ni.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512_ni.c
new file mode 100644
index 000000000..763057f12
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512_ni.c
@@ -0,0 +1,283 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+/**
+ * sha256_ctx_avx512_ni related functions aim to utilize Cannon Lake.
+ * Since SHANI is still slower than multibuffer for full lanes,
+ * sha256_ctx_mgr_init_avx512_ni and sha256_ctx_mgr_submit_avx512_ni are
+ * similar to their avx512 versions.
+ * sha256_ctx_mgr_flush_avx512_ni is different. It calls
+ * sha256_mb_mgr_flush_avx512_ni, which uses SHANI when fewer lanes than a
+ * threshold are in use.
+ *
+ */
+#if defined(HAVE_AS_KNOWS_AVX512) && defined(HAVE_AS_KNOWS_SHANI)
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_avx512_ni(SHA256_HASH_CTX_MGR * mgr)
+{
+ sha256_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx512_ni(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx, const void *buffer,
+ uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx512_ni(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx512_ni(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx =
+ (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
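+
+/*
+ * Worked example for hash_pad() above, assuming SHA256_PADLENGTHFIELD_SIZE
+ * is 8 (the usual 64-bit SHA-256 length field) and 64-byte blocks:
+ *   total_len = 100: i = 100 & 63 = 36, the 0x80 byte lands at offset 36,
+ *     then i += (63 & (0 - 109)) + 9 = 19 + 9, so i = 64 and one extra
+ *     block is hashed (36 + 1 + 8 fits within a single block).
+ *   total_len = 120: i = 56, then i += (63 & (0 - 129)) + 9 = 63 + 9, so
+ *     i = 128 and two extra blocks are hashed (56 + 1 + 8 > 64 spills
+ *     into a second block).
+ */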
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_avx512_ni_slver_080002ca;
+struct slver sha256_ctx_mgr_init_avx512_ni_slver = { 0x02ca, 0x00, 0x08 };
+
+struct slver sha256_ctx_mgr_submit_avx512_ni_slver_080002cb;
+struct slver sha256_ctx_mgr_submit_avx512_ni_slver = { 0x02cb, 0x00, 0x08 };
+
+struct slver sha256_ctx_mgr_flush_avx512_ni_slver_080002cc;
+struct slver sha256_ctx_mgr_flush_avx512_ni_slver = { 0x02cc, 0x00, 0x08 };
+
+#endif // HAVE_AS_KNOWS_AVX512 and HAVE_AS_KNOWS_SHANI
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base.c
new file mode 100644
index 000000000..58bf024a0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base.c
@@ -0,0 +1,301 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+#define ror32(x, r) (((x)>>(r)) ^ ((x)<<(32-(r))))
+
+#define W(x) w[(x) & 15]
+
+#define S0(w) (ror32(w,7) ^ ror32(w,18) ^ (w >> 3))
+#define S1(w) (ror32(w,17) ^ ror32(w,19) ^ (w >> 10))
+
+#define s0(a) (ror32(a,2) ^ ror32(a,13) ^ ror32(a,22))
+#define s1(e) (ror32(e,6) ^ ror32(e,11) ^ ror32(e,25))
+#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c))
+#define ch(e,f,g) ((e & f) ^ (g & ~e))
+
+#define step(i,a,b,c,d,e,f,g,h,k) \
+ if (i<16) W(i) = to_be32(ww[i]); \
+ else \
+ W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \
+ t2 = s0(a) + maj(a,b,c); \
+ t1 = h + s1(e) + ch(e,f,g) + k + W(i); \
+ d += t1; \
+ h = t1 + t2;
+
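+// Note on the step() macro: sha256_single() below rotates the a..h argument
+// order by one position on each of its 64 step() calls, so the macro only
+// has to write two variables per round (the new "e" via d += t1 and the new
+// "a" via h = t1 + t2) instead of shifting all eight working variables.
+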
+static void sha256_init(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static uint32_t sha256_update(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static void sha256_final(SHA256_HASH_CTX * ctx, uint32_t remain_len);
+static void OPT_FIX sha256_single(const void *data, uint32_t digest[]);
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+
+void sha256_ctx_mgr_init_base(SHA256_HASH_CTX_MGR * mgr)
+{
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_base(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ uint32_t remain_len;
+
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_PROCESSING) && (flags == HASH_ENTIRE)) {
+ // Cannot submit a new entire job to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags == HASH_FIRST) {
+
+ sha256_init(ctx, buffer, len);
+ sha256_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_UPDATE) {
+ sha256_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_LAST) {
+ remain_len = sha256_update(ctx, buffer, len);
+ sha256_final(ctx, remain_len);
+ }
+
+ if (flags == HASH_ENTIRE) {
+ sha256_init(ctx, buffer, len);
+ remain_len = sha256_update(ctx, buffer, len);
+ sha256_final(ctx, remain_len);
+ }
+
+ return ctx;
+}
+
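+// The base path processes each job synchronously inside submit, so there is
+// never an in-flight job for flush to return.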
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_base(SHA256_HASH_CTX_MGR * mgr)
+{
+ return NULL;
+}
+
+static void sha256_init(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Mark it as processing
+ ctx->status = HASH_CTX_STS_PROCESSING;
+}
+
+static uint32_t sha256_update(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ uint32_t remain_len = len;
+ uint32_t *digest = ctx->job.result_digest;
+
+ while (remain_len >= SHA256_BLOCK_SIZE) {
+ sha256_single(buffer, digest);
+ buffer = (void *)((uint8_t *) buffer + SHA256_BLOCK_SIZE);
+ remain_len -= SHA256_BLOCK_SIZE;
+ ctx->total_length += SHA256_BLOCK_SIZE;
+ }
+ ctx->status = HASH_CTX_STS_IDLE;
+ ctx->incoming_buffer = buffer;
+ return remain_len;
+}
+
+static void sha256_final(SHA256_HASH_CTX * ctx, uint32_t remain_len)
+{
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t i = remain_len, j;
+ uint8_t buf[2 * SHA256_BLOCK_SIZE];
+ uint32_t *digest = ctx->job.result_digest;
+
+ ctx->total_length += i;
+ memcpy(buf, buffer, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA256_BLOCK_SIZE) - SHA256_PADLENGTHFIELD_SIZE); j++)
+ buf[j] = 0;
+
+ if (i > SHA256_BLOCK_SIZE - SHA256_PADLENGTHFIELD_SIZE)
+ i = 2 * SHA256_BLOCK_SIZE;
+ else
+ i = SHA256_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) ctx->total_length * 8);
+
+ sha256_single(buf, digest);
+ if (i == 2 * SHA256_BLOCK_SIZE) {
+ sha256_single(buf + SHA256_BLOCK_SIZE, digest);
+ }
+
+ ctx->status = HASH_CTX_STS_COMPLETE;
+}
+
+void sha256_single(const void *data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e, f, g, h, t1, t2;
+ uint32_t w[16];
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+ f = digest[5];
+ g = digest[6];
+ h = digest[7];
+
+ step(0, a, b, c, d, e, f, g, h, 0x428a2f98);
+ step(1, h, a, b, c, d, e, f, g, 0x71374491);
+ step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcf);
+ step(3, f, g, h, a, b, c, d, e, 0xe9b5dba5);
+ step(4, e, f, g, h, a, b, c, d, 0x3956c25b);
+ step(5, d, e, f, g, h, a, b, c, 0x59f111f1);
+ step(6, c, d, e, f, g, h, a, b, 0x923f82a4);
+ step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5);
+ step(8, a, b, c, d, e, f, g, h, 0xd807aa98);
+ step(9, h, a, b, c, d, e, f, g, 0x12835b01);
+ step(10, g, h, a, b, c, d, e, f, 0x243185be);
+ step(11, f, g, h, a, b, c, d, e, 0x550c7dc3);
+ step(12, e, f, g, h, a, b, c, d, 0x72be5d74);
+ step(13, d, e, f, g, h, a, b, c, 0x80deb1fe);
+ step(14, c, d, e, f, g, h, a, b, 0x9bdc06a7);
+ step(15, b, c, d, e, f, g, h, a, 0xc19bf174);
+ step(16, a, b, c, d, e, f, g, h, 0xe49b69c1);
+ step(17, h, a, b, c, d, e, f, g, 0xefbe4786);
+ step(18, g, h, a, b, c, d, e, f, 0x0fc19dc6);
+ step(19, f, g, h, a, b, c, d, e, 0x240ca1cc);
+ step(20, e, f, g, h, a, b, c, d, 0x2de92c6f);
+ step(21, d, e, f, g, h, a, b, c, 0x4a7484aa);
+ step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc);
+ step(23, b, c, d, e, f, g, h, a, 0x76f988da);
+ step(24, a, b, c, d, e, f, g, h, 0x983e5152);
+ step(25, h, a, b, c, d, e, f, g, 0xa831c66d);
+ step(26, g, h, a, b, c, d, e, f, 0xb00327c8);
+ step(27, f, g, h, a, b, c, d, e, 0xbf597fc7);
+ step(28, e, f, g, h, a, b, c, d, 0xc6e00bf3);
+ step(29, d, e, f, g, h, a, b, c, 0xd5a79147);
+ step(30, c, d, e, f, g, h, a, b, 0x06ca6351);
+ step(31, b, c, d, e, f, g, h, a, 0x14292967);
+ step(32, a, b, c, d, e, f, g, h, 0x27b70a85);
+ step(33, h, a, b, c, d, e, f, g, 0x2e1b2138);
+ step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc);
+ step(35, f, g, h, a, b, c, d, e, 0x53380d13);
+ step(36, e, f, g, h, a, b, c, d, 0x650a7354);
+ step(37, d, e, f, g, h, a, b, c, 0x766a0abb);
+ step(38, c, d, e, f, g, h, a, b, 0x81c2c92e);
+ step(39, b, c, d, e, f, g, h, a, 0x92722c85);
+ step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1);
+ step(41, h, a, b, c, d, e, f, g, 0xa81a664b);
+ step(42, g, h, a, b, c, d, e, f, 0xc24b8b70);
+ step(43, f, g, h, a, b, c, d, e, 0xc76c51a3);
+ step(44, e, f, g, h, a, b, c, d, 0xd192e819);
+ step(45, d, e, f, g, h, a, b, c, 0xd6990624);
+ step(46, c, d, e, f, g, h, a, b, 0xf40e3585);
+ step(47, b, c, d, e, f, g, h, a, 0x106aa070);
+ step(48, a, b, c, d, e, f, g, h, 0x19a4c116);
+ step(49, h, a, b, c, d, e, f, g, 0x1e376c08);
+ step(50, g, h, a, b, c, d, e, f, 0x2748774c);
+ step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5);
+ step(52, e, f, g, h, a, b, c, d, 0x391c0cb3);
+ step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a);
+ step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f);
+ step(55, b, c, d, e, f, g, h, a, 0x682e6ff3);
+ step(56, a, b, c, d, e, f, g, h, 0x748f82ee);
+ step(57, h, a, b, c, d, e, f, g, 0x78a5636f);
+ step(58, g, h, a, b, c, d, e, f, 0x84c87814);
+ step(59, f, g, h, a, b, c, d, e, 0x8cc70208);
+ step(60, e, f, g, h, a, b, c, d, 0x90befffa);
+ step(61, d, e, f, g, h, a, b, c, 0xa4506ceb);
+ step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7);
+ step(63, b, c, d, e, f, g, h, a, 0xc67178f2);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+ digest[5] += f;
+ digest[6] += g;
+ digest[7] += h;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_base_slver_000002f0;
+struct slver sha256_ctx_mgr_init_base_slver = { 0x02f0, 0x00, 0x00 };
+
+struct slver sha256_ctx_mgr_submit_base_slver_000002f1;
+struct slver sha256_ctx_mgr_submit_base_slver = { 0x02f1, 0x00, 0x00 };
+
+struct slver sha256_ctx_mgr_flush_base_slver_000002f2;
+struct slver sha256_ctx_mgr_flush_base_slver = { 0x02f2, 0x00, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base_aliases.c
new file mode 100644
index 000000000..1483f631c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base_aliases.c
@@ -0,0 +1,54 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdint.h>
+#include <string.h>
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+
+extern void sha256_ctx_mgr_init_base(SHA256_HASH_CTX_MGR * mgr);
+extern SHA256_HASH_CTX *sha256_ctx_mgr_submit_base(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx, const void *buffer,
+ uint32_t len, HASH_CTX_FLAG flags);
+extern SHA256_HASH_CTX *sha256_ctx_mgr_flush_base(SHA256_HASH_CTX_MGR * mgr);
+
+void sha256_ctx_mgr_init(SHA256_HASH_CTX_MGR * mgr)
+{
+ return sha256_ctx_mgr_init_base(mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ return sha256_ctx_mgr_submit_base(mgr, ctx, buffer, len, flags);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush(SHA256_HASH_CTX_MGR * mgr)
+{
+ return sha256_ctx_mgr_flush_base(mgr);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c
new file mode 100644
index 000000000..f85f5c88b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c
@@ -0,0 +1,256 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_sse(SHA256_HASH_CTX_MGR * mgr)
+{
+ sha256_mb_mgr_init_sse(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_sse(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_sse(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_sse(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_sse_slver_00020151;
+struct slver sha256_ctx_mgr_init_sse_slver = { 0x0151, 0x02, 0x00 };
+
+struct slver sha256_ctx_mgr_submit_sse_slver_00020152;
+struct slver sha256_ctx_mgr_submit_sse_slver = { 0x0152, 0x02, 0x00 };
+
+struct slver sha256_ctx_mgr_flush_sse_slver_00020153;
+struct slver sha256_ctx_mgr_flush_sse_slver = { 0x0153, 0x02, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse_ni.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse_ni.c
new file mode 100644
index 000000000..e2c7e2738
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse_ni.c
@@ -0,0 +1,262 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_SHANI
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_sse_ni(SHA256_HASH_CTX_MGR * mgr)
+{
+	// Same as sse
+ sha256_mb_mgr_init_sse(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_sse_ni(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse_ni(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_sse_ni(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_sse_ni(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx =
+ (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse_ni(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse_ni(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_sse_ni_slver_070002c7;
+struct slver sha256_ctx_mgr_init_sse_ni_slver = { 0x02c7, 0x00, 0x07 };
+
+struct slver sha256_ctx_mgr_submit_sse_ni_slver_070002c8;
+struct slver sha256_ctx_mgr_submit_sse_ni_slver = { 0x02c8, 0x00, 0x07 };
+
+struct slver sha256_ctx_mgr_flush_sse_ni_slver_070002c9;
+struct slver sha256_ctx_mgr_flush_sse_ni_slver = { 0x02c9, 0x00, 0x07 };
+
+#endif // HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm
new file mode 100644
index 000000000..f9fb6d230
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm
@@ -0,0 +1,65 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define STS_UNKNOWN 0
+%define STS_BEING_PROCESSED 1
+%define STS_COMPLETED 2
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Threshold constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; if the number of lanes in use is <= threshold, use the single-buffer (sb) function
+%define SHA256_SB_THRESHOLD_SSE 1
+%define SHA256_SB_THRESHOLD_AVX 1
+%define SHA256_SB_THRESHOLD_AVX2 1
+%define SHA256_SB_THRESHOLD_AVX512 1
+%define SHA256_NI_SB_THRESHOLD_SSE 4 ; shani is faster than sse sha256_mb
+%define SHA256_NI_SB_THRESHOLD_AVX512 6
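+; For example, sha256_mb_mgr_flush_avx compares _num_lanes_inuse against
+; SHA256_SB_THRESHOLD_AVX and, when at or below it, hands the shortest job
+; to the single-buffer sha256_opt_x1 routine instead of the multibuffer
+; kernel; the *_NI_* thresholds play the same role in the SHA-NI flush paths.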
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA256_JOB structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SHA256_JOB
+
+;;; name size align
+FIELD _buffer, 8, 8 ; pointer to buffer
+FIELD	_len,	8,	8	; length in blocks
+FIELD _result_digest, 8*4, 64 ; Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+
+%assign _SHA256_JOB_size _FIELD_OFFSET
+%assign _SHA256_JOB_align _STRUCT_ALIGN
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_flush_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_flush_test.c
new file mode 100644
index 000000000..28f1f5118
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_flush_test.c
@@ -0,0 +1,146 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha256_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS (SHA256_MAX_LANES - 1)
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][SHA256_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha256_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+uint8_t lens_print_and_check(SHA256_HASH_CTX_MGR * mgr)
+{
+ static int32_t last_lens[SHA256_MAX_LANES] = { 0 };
+ int32_t len;
+ uint8_t num_unchanged = 0;
+ int i;
+ for (i = 0; i < SHA256_MAX_LANES; i++) {
+ len = (int32_t) mgr->mgr.lens[i];
+		// lens[i] in mgr is block_count<<4 | lane_index; convert to bytes for printing
+ len = (len >= 16) ? (len >> 4 << 6) : 0;
+ printf("\t%d", len);
+ if (last_lens[i] > 0 && last_lens[i] == len)
+ num_unchanged += 1;
+ last_lens[i] = len;
+ }
+ printf("\n");
+ return num_unchanged;
+}
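+
+// Example of the decode above (illustrative): a lens[] entry of 0x193 means
+// lane 3 with 0x19 = 25 blocks pending, printed as 25 << 6 = 1600 bytes.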
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ uint8_t num_ret, num_unchanged = 0;
+ int ret;
+
+ printf("sha256_mb flush test, %d buffers with %d length: \n", TEST_BUFS, TEST_LEN);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed, test aborted\n");
+ return 1;
+ }
+
+ sha256_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ lens[i] = TEST_LEN / SHA256_MAX_LANES * (i + 1);
+ bufs[i] = (unsigned char *)malloc(lens[i]);
+ if (bufs[i] == NULL) {
+			printf("malloc failed, test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], lens[i]);
+ }
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Init ctx contexts
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha256_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // Run sb_sha256 test
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ printf("Changes of lens inside mgr:\n");
+ lens_print_and_check(mgr);
+ while (sha256_ctx_mgr_flush(mgr)) {
+ num_ret = lens_print_and_check(mgr);
+ num_unchanged = num_unchanged > num_ret ? num_unchanged : num_ret;
+ }
+	printf("Done printing sha256_mb lens info\n");
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%08X <=> 0x%08X \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else if (num_unchanged)
+		printf("SHA-NI is used when %d or %d jobs are still incomplete\n",
+		       num_unchanged, num_unchanged + 1);
+ else
+ printf("SHA-NI is not used, or used for last job\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm
new file mode 100644
index 000000000..ebba9ca36
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm
@@ -0,0 +1,74 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA256 Out Of Order Data Structures
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; LANE_DATA
+;;; name size align
+FIELD _job_in_lane, 8, 8 ; pointer to job object
+END_FIELDS
+
+%assign _LANE_DATA_size _FIELD_OFFSET
+%assign _LANE_DATA_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SHA256_ARGS_X16
+;;; name size align
+FIELD _digest, 4*8*16, 4 ; transposed digest
+FIELD _data_ptr, 8*16, 8 ; array of pointers to data
+END_FIELDS
+
+%assign _SHA256_ARGS_X4_size _FIELD_OFFSET
+%assign _SHA256_ARGS_X4_align _STRUCT_ALIGN
+%assign _SHA256_ARGS_X8_size _FIELD_OFFSET
+%assign _SHA256_ARGS_X8_align _STRUCT_ALIGN
+%assign _SHA256_ARGS_X16_size _FIELD_OFFSET
+%assign _SHA256_ARGS_X16_align _STRUCT_ALIGN
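+
+; Note: _digest is stored transposed - the lanes' values for a given digest
+; word are grouped together rather than one contiguous 8-word digest per
+; lane - which is why the flush routines gather a finished job's result one
+; word at a time (e.g. vmovd/vpinsrd from _args_digest + 4*idx + word*16 in
+; the 4-lane AVX manager).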
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; MB_MGR
+;;; name size align
+FIELD _args, _SHA256_ARGS_X4_size, _SHA256_ARGS_X4_align
+FIELD _lens, 4*16, 8
+FIELD _unused_lanes, 8, 8
+FIELD _ldata, _LANE_DATA_size*16, _LANE_DATA_align
+FIELD _num_lanes_inuse, 4, 4
+END_FIELDS
+
+%assign _MB_MGR_size _FIELD_OFFSET
+%assign _MB_MGR_align _STRUCT_ALIGN
+
+_args_digest equ _args + _digest
+_args_data_ptr equ _args + _data_ptr
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm
new file mode 100644
index 000000000..69f27f42d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm
@@ -0,0 +1,253 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x4_avx
+extern sha256_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*3
+_ALIGN_SIZE equ 0
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_avx(SHA256_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha256_mb_mgr_flush_avx, function
+sha256_mb_mgr_flush_avx:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rsi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to check whether all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
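+	; Give every empty lane the data pointer of lane "idx" and a length of
+	; 0xFFFFFFFF so that empty lanes never win the minimum-length search
+	; below and the multibuffer kernel still reads valid memory in each lane.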
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ ; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func
+ cmp dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_AVX
+ ja mb_processing
+
+	; lens[idx] == len2 | idx, so storing idx leaves this lane with 0 blocks remaining
+ shr len2, 4
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x1000 ; avx has 4 lanes *4, r10b is idx, r10b2 is 16
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha256_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x4_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov r12, [rsp + _GPR_SAVE + 8*1]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm
new file mode 100644
index 000000000..0ee0589cf
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm
@@ -0,0 +1,274 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x8_avx2
+extern sha256_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define tmp4 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define tmp4 rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by sha256_mb_x8_avx2 and sha256_opt_x1
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_avx2(SHA256_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha256_mb_mgr_flush_avx2, function
+sha256_mb_mgr_flush_avx2:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to check whether all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+ cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [four]
+ cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [five]
+ cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [six]
+ cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [seven]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+	; compare against the single-buffer threshold: if num_lanes_inuse <= threshold, use the single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_AVX2
+ ja mb_processing
+
+ ; lensN-len2=idx
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x2000 ; avx2 has 8 lanes *4, r10b is idx, r10b2 is 32
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha256_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x8_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*8]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*8], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*8], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*8], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*8]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*8], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*8], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*8], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
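The AVX2 flush above replaces the scalar cmp/cmovb chain with a vpminud/vpalignr reduction over the eight lens[] dwords, then masks off the low nibble of the winner (clear_low_nibble), broadcasts it, and subtracts it from every lane so each lane keeps its index while its remaining block count shrinks. A scalar C sketch of that subtraction step; names are illustrative, not part of the library:

    #include <stdint.h>

    /* min_enc is the reduction result: (common_blocks << 4) | winning_lane. */
    static void consume_common_blocks(uint32_t lens[8], uint32_t min_enc)
    {
    	uint32_t common = min_enc & ~0xFu;   /* vpand with clear_low_nibble */

    	for (unsigned i = 0; i < 8; i++)
    		lens[i] -= common;           /* vpshufd broadcast + vpsubd */
    }
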
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm
new file mode 100644
index 000000000..201cd42b0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm
@@ -0,0 +1,288 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sha256_mb_x16_avx512
+extern sha256_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define tmp4 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define tmp4 rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by sha256_mb_x16_avx512 and sha256_opt_x1
+%define idx rbp
+
+%define num_lanes_inuse r9
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_avx512(SHA256_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha256_mb_mgr_flush_avx512, function
+sha256_mb_mgr_flush_avx512:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+	; compare against the single-buffer threshold: if num_lanes_inuse <= threshold, use the single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_AVX512
+ ja mb_processing
+
+ ; lensN-len2=idx
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+	or	r10, 0x4000	; avx512 has 16 lanes *4, r10b is idx, r10b2 is 64
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha256_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x16_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_mb_mgr_flush_avx512
+no_sha256_mb_mgr_flush_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
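The AVX-512 manager looks for a lane that still holds a job with a chain of cmp/cmovne instructions indexed through the lane_1..lane_15 constants: a branch-free linear scan where the last occupied lane seen wins and lane 0 is the default. A C equivalent, assuming 16 lanes (illustrative only):

    #include <stddef.h>

    /* Return the index of a lane whose job_in_lane pointer is non-NULL,
     * or 0 if only lane 0 (or no lane) is occupied -- as the asm scan does. */
    static unsigned find_busy_lane(void *const job_in_lane[16])
    {
    	unsigned idx = 0;

    	for (unsigned i = 1; i < 16; i++)
    		if (job_in_lane[i] != NULL)
    			idx = i;     /* cmovne idx, [lane_i] */
    	return idx;
    }
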
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm
new file mode 100644
index 000000000..7bc9d32a4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm
@@ -0,0 +1,295 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ %ifdef HAVE_AS_KNOWS_SHANI
+
+extern sha256_mb_x16_avx512
+extern sha256_ni_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define tmp4 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define tmp4 rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by sha256_mb_x16_avx512 and sha256_ni_x1
+%define idx rbp
+
+%define num_lanes_inuse r9
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_avx512_ni(SHA256_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha256_mb_mgr_flush_avx512_ni, function
+sha256_mb_mgr_flush_avx512_ni:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+	; compare against the SHA-NI single-buffer threshold: if num_lanes_inuse <= threshold, use the SHA-NI single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA256_NI_SB_THRESHOLD_AVX512
+ ja mb_processing
+
+ ; lensN-len2=idx
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+	or	r10, 0x4000	; avx512 has 16 lanes *4, r10b is idx, r10b2 is 64
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha256_ni_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x16_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+ %else
+ %ifidn __OUTPUT_FORMAT__, win64
+ global no_sha256_mb_mgr_flush_avx512_ni
+ no_sha256_mb_mgr_flush_avx512_ni:
+ %endif
+ %endif ; HAVE_AS_KNOWS_SHANI
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+ global no_sha256_mb_mgr_flush_avx512_ni
+ no_sha256_mb_mgr_flush_avx512_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm
new file mode 100644
index 000000000..69ae4bad5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm
@@ -0,0 +1,254 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x4_sse
+extern sha256_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*3
+_ALIGN_SIZE equ 0
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_sse(SHA256_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha256_mb_mgr_flush_sse, function
+sha256_mb_mgr_flush_sse:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rsi
+ movdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to check whether all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+	; compare against the single-buffer threshold: if num_lanes_inuse <= threshold, use the single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_SSE
+ ja mb_processing
+
+ ; lensN-len2=idx
+ shr len2, 4
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x1000 ; sse has 4 lanes *4, r10b is idx, r10b2 is 16
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha256_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x4_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ movd xmm1, [state + _args_digest + 4*idx + 4*16]
+ pinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+ pinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+ pinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ movdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov r12, [rsp + _GPR_SAVE + 8*1]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
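All of these flush managers make the same dispatch decision: when only a few lanes are in use (at or below SHA256_SB_THRESHOLD_* or SHA256_NI_SB_THRESHOLD_*), the single-buffer kernel (sha256_opt_x1 or sha256_ni_x1) is run on the shortest lane instead of the multi-buffer core, since the lane transpose overhead is not worth paying for one or two jobs. A sketch of just that decision; the threshold value below is a placeholder, not the library's constant:

    #include <stdint.h>

    enum sha256_path { USE_SINGLE_BUFFER, USE_MULTI_BUFFER };

    #define SB_THRESHOLD 1   /* placeholder for SHA256_SB_THRESHOLD_SSE etc. */

    /* Mirrors "cmp dword [state + _num_lanes_inuse], THRESHOLD / ja mb_processing". */
    static enum sha256_path choose_path(uint32_t num_lanes_inuse)
    {
    	return (num_lanes_inuse <= SB_THRESHOLD) ? USE_SINGLE_BUFFER
    	                                         : USE_MULTI_BUFFER;
    }
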
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse_ni.asm
new file mode 100644
index 000000000..43b8fcbe4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse_ni.asm
@@ -0,0 +1,261 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+extern sha256_mb_x4_sse
+extern sha256_ni_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*3
+_ALIGN_SIZE equ 0
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_sse_ni(SHA256_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha256_mb_mgr_flush_sse_ni, function
+sha256_mb_mgr_flush_sse_ni:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rsi
+ movdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to check whether all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+	; compare against the SHA-NI single-buffer threshold: if num_lanes_inuse <= threshold, use the SHA-NI single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA256_NI_SB_THRESHOLD_SSE
+ ja mb_processing
+
+ ; lensN-len2=idx
+ shr len2, 4
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x1000 ; sse has 4 lanes *4, r10b is idx, r10b2 is 16
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha256_ni_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x4_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ movd xmm1, [state + _args_digest + 4*idx + 4*16]
+ pinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+ pinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+ pinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ movdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov r12, [rsp + _GPR_SAVE + 8*1]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
+%else
+ %ifidn __OUTPUT_FORMAT__, win64
+ global no_sha256_mb_mgr_flush_sse_ni
+ no_sha256_mb_mgr_flush_sse_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c
new file mode 100644
index 000000000..903fb733b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+
+void sha256_mb_mgr_init_avx2(SHA256_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xF76543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA256_X8_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
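unused_lanes is a packed nibble stack of free lane indices: 0xF76543210 here lists lanes 0..7 with 0xF as the top-of-stack sentinel. The submit managers pop the low nibble to allocate a lane, and the completion path pushes the lane back. A C sketch of those two operations; the helpers are illustrative, not isa-l API:

    #include <stdint.h>

    /* Pop a free lane index off the nibble stack (movzx + and 0xF, shr 4). */
    static unsigned pop_lane(uint64_t *unused_lanes)
    {
    	unsigned lane = (unsigned)(*unused_lanes & 0xF);

    	*unused_lanes >>= 4;
    	return lane;
    }

    /* Push a lane index back once its job completes (shl 4, or idx). */
    static void push_lane(uint64_t *unused_lanes, unsigned lane)
    {
    	*unused_lanes = (*unused_lanes << 4) | lane;
    }
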
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c
new file mode 100644
index 000000000..b875735f9
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+
+void sha256_mb_mgr_init_avx512(SHA256_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xfedcba9876543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA256_MAX_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c
new file mode 100644
index 000000000..cf22c4aee
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+
+void sha256_mb_mgr_init_sse(SHA256_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xF3210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA256_MIN_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
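The three init variants differ only in lane count and in the matching unused_lanes constant: 0xF3210 for the 4-lane SSE/AVX managers, 0xF76543210 for the 8-lane AVX2 manager, and 0xfedcba9876543210 for the 16-lane AVX-512 manager (where no nibble is left over for a sentinel). A small sketch of how such a constant is formed; the helper is purely illustrative:

    #include <stdint.h>

    /* Build the initial free-lane nibble stack for nlanes <= 15:
     * indices nlanes-1 .. 0 with a 0xF sentinel above them,
     * e.g. 0xF3210 for 4 lanes and 0xF76543210 for 8 lanes. */
    static uint64_t initial_unused_lanes(unsigned nlanes)
    {
    	uint64_t v = 0xF;

    	for (int i = (int)nlanes - 1; i >= 0; i--)
    		v = (v << 4) | (unsigned)i;
    	return v;
    }
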
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm
new file mode 100644
index 000000000..cb7d5790a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm
@@ -0,0 +1,260 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x4_avx
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE 16*10
+%define _GPR_SAVE 8*5
+%define STACK_SPACE _GPR_SAVE + _XMM_SAVE
+
+; SHA256_JOB* sha256_mb_mgr_submit_avx(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha256_mb_mgr_submit_avx, function
+sha256_mb_mgr_submit_avx:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ vmovdqa [rsp + 16*0], xmm6
+ vmovdqa [rsp + 16*1], xmm7
+ vmovdqa [rsp + 16*2], xmm8
+ vmovdqa [rsp + 16*3], xmm9
+ vmovdqa [rsp + 16*4], xmm10
+ vmovdqa [rsp + 16*5], xmm11
+ vmovdqa [rsp + 16*6], xmm12
+ vmovdqa [rsp + 16*7], xmm13
+ vmovdqa [rsp + 16*8], xmm14
+ vmovdqa [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ vmovdqa xmm0, [job + _result_digest + 0*16]
+ vmovdqa xmm1, [job + _result_digest + 1*16]
+ vmovd [state + _args_digest + 4*lane + 0*16], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3
+ vmovd [state + _args_digest + 4*lane + 4*16], xmm1
+ vpextrd [state + _args_digest + 4*lane + 5*16], xmm1, 1
+ vpextrd [state + _args_digest + 4*lane + 6*16], xmm1, 2
+ vpextrd [state + _args_digest + 4*lane + 7*16], xmm1, 3
+
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x4_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 16*0]
+ vmovdqa xmm7, [rsp + 16*1]
+ vmovdqa xmm8, [rsp + 16*2]
+ vmovdqa xmm9, [rsp + 16*3]
+ vmovdqa xmm10, [rsp + 16*4]
+ vmovdqa xmm11, [rsp + 16*5]
+ vmovdqa xmm12, [rsp + 16*6]
+ vmovdqa xmm13, [rsp + 16*7]
+ vmovdqa xmm14, [rsp + 16*8]
+ vmovdqa xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+H0: dd 0x6a09e667
+H1: dd 0xbb67ae85
+H2: dd 0x3c6ef372
+H3: dd 0xa54ff53a
+H4: dd 0x510e527f
+H5: dd 0x9b05688c
+H6: dd 0x1f83d9ab
+H7: dd 0x5be0cd19
+
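The submit path above scatters the job's result_digest into args_digest with vpextrd at byte offsets 4*lane + k*16, and the completion path gathers it back with vmovd/vpinsrd: the digests are stored transposed, with word k of lane l at dword index k*NLANES + l, so the multi-buffer core can load word k of every lane in a single vector load. A C sketch of that layout for the 4-lane managers; the helper names are illustrative:

    #include <stdint.h>

    #define NLANES 4   /* 8 for the AVX2 manager, 16 for AVX-512 */

    /* Scatter a job's 8-word digest into the transposed args_digest array
     * (what the vpextrd stores in submit do). */
    static void scatter_digest(uint32_t args_digest[8 * NLANES],
    			   const uint32_t result_digest[8], unsigned lane)
    {
    	for (unsigned k = 0; k < 8; k++)
    		args_digest[k * NLANES + lane] = result_digest[k];
    }

    /* Gather it back on completion (what the vmovd/vpinsrd loads do). */
    static void gather_digest(uint32_t result_digest[8],
    			  const uint32_t args_digest[8 * NLANES], unsigned lane)
    {
    	for (unsigned k = 0; k < 8; k++)
    		result_digest[k] = args_digest[k * NLANES + lane];
    }
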
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm
new file mode 100644
index 000000000..af2fc89ea
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm
@@ -0,0 +1,246 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "memcpy.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x8_avx2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define idx r8
+%define last_len r8
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+
+%define tmp r9
+
+%define lane_data r10
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; SHA256_JOB* sha256_mb_mgr_submit_avx2(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha256_mb_mgr_submit_avx2, function
+sha256_mb_mgr_submit_avx2:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+ mov [rsp + 8*6], r14
+ mov [rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ mov [lane_data + _job_in_lane], job
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ vmovdqu xmm1, [job + _result_digest + 1*16]
+ vmovd [state + _args_digest + 4*lane + 0*4*8], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*4*8], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*4*8], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*4*8], xmm0, 3
+ vmovd [state + _args_digest + 4*lane + 4*4*8], xmm1
+ vpextrd [state + _args_digest + 4*lane + 5*4*8], xmm1, 1
+ vpextrd [state + _args_digest + 4*lane + 6*4*8], xmm1, 2
+ vpextrd [state + _args_digest + 4*lane + 7*4*8], xmm1, 3
+
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xf
+ jne return_null
+
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x8_avx2
+
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*8]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*8], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*8], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*8], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*8]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*8], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*8], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*8], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ mov r14, [rsp + 8*6]
+ mov r15, [rsp + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+
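Note on the scheduler above: each _lens entry packs the job's length value shifted left by four with the owning lane index in the low nibble, so a single unsigned minimum across the vector yields both the shortest job and its lane; the vpminud/vpalignr ladder computes that minimum, and the clear_low_nibble mask lets the common length be subtracted from every lane without disturbing the lane nibbles. The C below is a minimal scalar sketch of the same bookkeeping, not ISA-L code; find_min_lane and NUM_LANES are illustrative names.

#include <stdint.h>

#define NUM_LANES 8			/* the AVX2 manager runs 8 lanes */

/* Scalar model of start_loop: lens[i] == (remaining_len << 4) | i. */
static uint32_t find_min_lane(uint32_t lens[NUM_LANES], uint32_t *common_len)
{
	uint32_t min = lens[0];
	int i;

	for (i = 1; i < NUM_LANES; i++)
		if (lens[i] < min)
			min = lens[i];

	uint32_t idx = min & 0xF;	/* lane that finishes first */
	*common_len = min >> 4;		/* length every lane can hash now */

	/* Subtract the common length but keep each lane nibble intact,
	 * mirroring the clear_low_nibble mask and vpsubd sequence above. */
	for (i = 0; i < NUM_LANES; i++)
		lens[i] -= (min & ~0xFu);

	return idx;
}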
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm
new file mode 100644
index 000000000..cdc477370
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm
@@ -0,0 +1,261 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "memcpy.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sha256_mb_x16_avx512
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define idx r8
+%define last_len r8
+%define p r11
+%define start_offset r11
+%define num_lanes_inuse r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+
+%define tmp r9
+
+%define lane_data r10
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; SHA256_JOB* sha256_mb_mgr_submit_avx512(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha256_mb_mgr_submit_avx512, function
+sha256_mb_mgr_submit_avx512:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+ mov [rsp + 8*6], r14
+ mov [rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ mov [lane_data + _job_in_lane], job
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ vmovdqu xmm1, [job + _result_digest + 1*16]
+ vmovd [state + _args_digest + 4*lane + 0*4*16], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*4*16], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*4*16], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*4*16], xmm0, 3
+ vmovd [state + _args_digest + 4*lane + 4*4*16], xmm1
+ vpextrd [state + _args_digest + 4*lane + 5*4*16], xmm1, 1
+ vpextrd [state + _args_digest + 4*lane + 6*4*16], xmm1, 2
+ vpextrd [state + _args_digest + 4*lane + 7*4*16], xmm1, 3
+
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ add num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ cmp num_lanes_inuse, 16
+ jne return_null
+
+start_loop:
+	; Find min length; ymm0 holds the first 8 lane lengths, ymm1 holds the last 8
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+	vperm2i128 ymm3, ymm2, ymm2, 1	; ymm3 has {x,x, x,C3,x,x, x,G3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x16_avx512
+
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ mov r14, [rsp + 8*6]
+ mov r15, [rsp + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=32
+
+align 32
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_mb_mgr_submit_avx512
+no_sha256_mb_mgr_submit_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
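Note on lane bookkeeping: _unused_lanes is a small stack of free lane indices packed four bits apiece, popped on submit with the and/shr pair and pushed back on completion with shl/or; _num_lanes_inuse counts occupied lanes, and the AVX-512 variant uses that counter (rather than a sentinel in _unused_lanes) to decide when all 16 lanes are full and the 16-lane core may run. The C below is an illustrative model of the nibble stack only, not ISA-L code; pop_lane and push_lane are hypothetical names.

#include <stdint.h>

/* Pop the lowest nibble: the index of a free lane to give to a new job. */
static unsigned pop_lane(uint64_t *unused_lanes)
{
	unsigned lane = (unsigned)(*unused_lanes & 0xF);
	*unused_lanes >>= 4;
	return lane;
}

/* Push a lane index back once its job has completed. */
static void push_lane(uint64_t *unused_lanes, unsigned lane)
{
	*unused_lanes = (*unused_lanes << 4) | lane;
}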
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm
new file mode 100644
index 000000000..b1bbc7002
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm
@@ -0,0 +1,261 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x4_sse
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE 16*10
+%define _GPR_SAVE 8*5
+%define STACK_SPACE _GPR_SAVE + _XMM_SAVE
+
+; SHA256_JOB* sha256_mb_mgr_submit_sse(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha256_mb_mgr_submit_sse, function
+sha256_mb_mgr_submit_sse:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ movdqa [rsp + 16*0], xmm6
+ movdqa [rsp + 16*1], xmm7
+ movdqa [rsp + 16*2], xmm8
+ movdqa [rsp + 16*3], xmm9
+ movdqa [rsp + 16*4], xmm10
+ movdqa [rsp + 16*5], xmm11
+ movdqa [rsp + 16*6], xmm12
+ movdqa [rsp + 16*7], xmm13
+ movdqa [rsp + 16*8], xmm14
+ movdqa [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ movdqa xmm0, [job + _result_digest + 0*16]
+ movdqa xmm1, [job + _result_digest + 1*16]
+ movd [state + _args_digest + 4*lane + 0*16], xmm0
+ pextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1
+ pextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2
+ pextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3
+ movd [state + _args_digest + 4*lane + 4*16], xmm1
+ pextrd [state + _args_digest + 4*lane + 5*16], xmm1, 1
+ pextrd [state + _args_digest + 4*lane + 6*16], xmm1, 2
+ pextrd [state + _args_digest + 4*lane + 7*16], xmm1, 3
+
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x4_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ movd xmm1, [state + _args_digest + 4*idx + 4*16]
+ pinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+ pinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+ pinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ movdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + 16*0]
+ movdqa xmm7, [rsp + 16*1]
+ movdqa xmm8, [rsp + 16*2]
+ movdqa xmm9, [rsp + 16*3]
+ movdqa xmm10, [rsp + 16*4]
+ movdqa xmm11, [rsp + 16*5]
+ movdqa xmm12, [rsp + 16*6]
+ movdqa xmm13, [rsp + 16*7]
+ movdqa xmm14, [rsp + 16*8]
+ movdqa xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+H0: dd 0x6a09e667
+H1: dd 0xbb67ae85
+H2: dd 0x3c6ef372
+H3: dd 0xa54ff53a
+H4: dd 0x510e527f
+H5: dd 0x9b05688c
+H6: dd 0x1f83d9ab
+H7: dd 0x5be0cd19
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse_ni.asm
new file mode 100644
index 000000000..cb1dce641
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse_ni.asm
@@ -0,0 +1,301 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+extern sha256_mb_x4_sse
+extern sha256_ni_x2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE 16*10
+%define _GPR_SAVE 8*7
+%define STACK_SPACE _GPR_SAVE + _XMM_SAVE
+
+; SHA256_JOB* sha256_mb_mgr_submit_sse_ni(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha256_mb_mgr_submit_sse_ni, function
+sha256_mb_mgr_submit_sse_ni:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+ mov [rsp + _XMM_SAVE + 8*5], r13
+ mov [rsp + _XMM_SAVE + 8*6], r14
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ movdqa [rsp + 16*0], xmm6
+ movdqa [rsp + 16*1], xmm7
+ movdqa [rsp + 16*2], xmm8
+ movdqa [rsp + 16*3], xmm9
+ movdqa [rsp + 16*4], xmm10
+ movdqa [rsp + 16*5], xmm11
+ movdqa [rsp + 16*6], xmm12
+ movdqa [rsp + 16*7], xmm13
+ movdqa [rsp + 16*8], xmm14
+ movdqa [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ movdqa xmm0, [job + _result_digest + 0*16]
+ movdqa xmm1, [job + _result_digest + 1*16]
+ movd [state + _args_digest + 4*lane + 0*16], xmm0
+ pextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1
+ pextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2
+ pextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3
+ movd [state + _args_digest + 4*lane + 4*16], xmm1
+ pextrd [state + _args_digest + 4*lane + 5*16], xmm1, 1
+ pextrd [state + _args_digest + 4*lane + 6*16], xmm1, 2
+ pextrd [state + _args_digest + 4*lane + 7*16], xmm1, 3
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+
+ cmp unused_lanes, 0xF32 ; we will process two jobs at the same time
+ jne return_null ; wait for another sha_ni job
+
+	; Compare against the SHA-NI single-buffer threshold: if the SSE lane count (4) is at or below it, use the SHA-NI path
+ %if SHA256_NI_SB_THRESHOLD_SSE >= 4 ; there are 4 lanes in sse mb
+ ; shani glue code
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+ ; lensN-len2=idx
+ sub lens0, len2
+ sub lens1, len2
+
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov r10, idx
+ or r10, 0x1000 ; sse has 4 lanes *4, r10b is idx, r10b2 is 16
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha256_ni_x2
+ ; state and idx are intact
+ %else
+ ; original mb code
+ cmp unused_lanes, 0xF
+ jne return_null
+
+ start_loop:
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x4_sse
+ ; state and idx are intact
+ %endif
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ movd xmm1, [state + _args_digest + 4*idx + 4*16]
+ pinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+ pinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+ pinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ movdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + 16*0]
+ movdqa xmm7, [rsp + 16*1]
+ movdqa xmm8, [rsp + 16*2]
+ movdqa xmm9, [rsp + 16*3]
+ movdqa xmm10, [rsp + 16*4]
+ movdqa xmm11, [rsp + 16*5]
+ movdqa xmm12, [rsp + 16*6]
+ movdqa xmm13, [rsp + 16*7]
+ movdqa xmm14, [rsp + 16*8]
+ movdqa xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ mov r13, [rsp + _XMM_SAVE + 8*5]
+ mov r14, [rsp + _XMM_SAVE + 8*6]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+H0: dd 0x6a09e667
+H1: dd 0xbb67ae85
+H2: dd 0x3c6ef372
+H3: dd 0xa54ff53a
+H4: dd 0x510e527f
+H5: dd 0x9b05688c
+H6: dd 0x1f83d9ab
+H7: dd 0x5be0cd19
+
+%else
+ %ifidn __OUTPUT_FORMAT__, win64
+ global no_sha256_mb_mgr_submit_sse_ni
+ no_sha256_mb_mgr_submit_sse_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_SHANI
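Note: when assembled with SHA-NI support, the submit path above batches jobs in pairs (the 0xF32 comparison waits for a second job); whether it then calls the two-lane sha256_ni_x2 kernel or falls back to the classic four-lane sha256_mb_x4_sse scheduling is decided at assembly time by the %if on SHA256_NI_SB_THRESHOLD_SSE.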
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c
new file mode 100644
index 000000000..768bfca78
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c
@@ -0,0 +1,160 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha256_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 200
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA256_DIGEST_NWORDS];
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ int ret;
+
+ printf("multibinary_sha256 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed, test aborted\n");
+ return 1;
+ }
+
+ sha256_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+			printf("malloc failed, test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // SSL test
+ SHA256(bufs[i], TEST_LEN, digest_ssl[i]);
+
+ // sb_sha256 test
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha256_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Random buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run SSL test
+ SHA256(bufs[i], lens[i], digest_ssl[i]);
+
+ // Run sb_sha256 test
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha256_ssl rand: Pass\n");
+
+ return fail;
+}
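Condensed, the submit/flush pattern this test exercises looks like the sketch below. This is a minimal, illustrative example rather than ISA-L code: error handling is omitted, hash_one is a hypothetical helper, and real callers submit many contexts before flushing so that all lanes stay busy.

#include <stdlib.h>
#include "sha256_mb.h"

/* Hash one buffer with the multi-buffer manager (illustrative only). */
static void hash_one(unsigned char *buf, uint32_t len,
		     uint32_t out[SHA256_DIGEST_NWORDS])
{
	SHA256_HASH_CTX_MGR *mgr = NULL;
	SHA256_HASH_CTX ctx;
	uint32_t j;

	posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
	sha256_ctx_mgr_init(mgr);
	hash_ctx_init(&ctx);

	sha256_ctx_mgr_submit(mgr, &ctx, buf, len, HASH_ENTIRE);
	while (sha256_ctx_mgr_flush(mgr))
		;	/* drain until every lane is idle */

	for (j = 0; j < SHA256_DIGEST_NWORDS; j++)
		out[j] = ctx.job.result_digest[j];
	free(mgr);
}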
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c
new file mode 100644
index 000000000..adba77f3d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c
@@ -0,0 +1,203 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha256_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][SHA256_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha256_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ uint8_t *tmp_buf;
+ int ret;
+
+ printf("multibinary_sha256 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed, test aborted\n");
+ return 1;
+ }
+
+ sha256_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+			printf("malloc failed, test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha256_ref(bufs[i], digest_ref[i], TEST_LEN);
+
+ // Run sb_sha256 test
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%08X <=> 0x%08X \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha256_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Use buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run reference test
+ sha256_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // Run sha256_mb test
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail "
+ "0x%08X <=> 0x%08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ // Test at the end of buffer
+ jobs = rand() % TEST_BUFS;
+ tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs);
+ if (!tmp_buf) {
+ printf("malloc failed, end test aborted.\n");
+ return 1;
+ }
+
+ rand_buffer(tmp_buf, jobs);
+
+ sha256_ctx_mgr_init(mgr);
+
+ // Extend to the end of allocated buffer to construct jobs
+ for (i = 0; i < jobs; i++) {
+ bufs[i] = (uint8_t *) & tmp_buf[i];
+ lens[i] = jobs - i;
+
+ // Reference test
+ sha256_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // sb_sha256 test
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("End test failed at offset %d - result: 0x%08X"
+ ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ putchar('.');
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha256 rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c
new file mode 100644
index 000000000..9535d80df
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c
@@ -0,0 +1,300 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha256_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define UPDATE_SIZE 13*SHA256_BLOCK_SIZE
+#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*SHA256_BLOCK_SIZE))
+
+#ifdef DEBUG
+# define debug_char(x) putchar(x)
+#else
+# define debug_char(x) do {} while (0)
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint32_t digest_ref[TEST_BUFS][SHA256_DIGEST_NWORDS];
+
+extern void sha256_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, fail = 0;
+ int len_done, len_rem, len_rand;
+ unsigned char *bufs[TEST_BUFS];
+ unsigned char *buf_ptr[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int joblen, jobs, t;
+ int ret;
+
+ printf("multibinary_sha256_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed, test aborted\n");
+ return 1;
+ }
+
+ sha256_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ buf_ptr[i] = bufs[i];
+ if (bufs[i] == NULL) {
+			printf("malloc failed, test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha256_ref(bufs[i], digest_ref[i], TEST_LEN);
+ }
+
+ // Run sb_sha256 tests
+ for (i = 0; i < TEST_BUFS;) {
+ len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_done == 0)
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_FIRST);
+ else if (len_rem <= UPDATE_SIZE)
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ // Add jobs while available or finished
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha256_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha256_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+
+ len_done = (int)((unsigned long)buf_ptr[i]
+ - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_rem <= UPDATE_SIZE)
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha256_ctx_mgr_flush(mgr);
+ }
+
+ // Check digests
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+				printf("Test%d fixed size, digest%d fail %8X <=> %8X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ for (i = 0; i < jobs; i++) {
+ joblen = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], joblen);
+ lens[i] = joblen;
+ buf_ptr[i] = bufs[i];
+ sha256_ref(bufs[i], digest_ref[i], lens[i]);
+ }
+
+ sha256_ctx_mgr_init(mgr);
+
+ // Run sha256_sb jobs
+ i = 0;
+ while (i < jobs) {
+ // Submit a new job
+ len_rand = SHA256_BLOCK_SIZE +
+ SHA256_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS);
+
+ if (lens[i] > len_rand)
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_FIRST);
+ else
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], lens[i], HASH_ENTIRE);
+
+ // Returned ctx could be:
+ // - null context (we are just getting started and lanes aren't full yet), or
+ // - finished already (an ENTIRE we submitted or a previous LAST is returned), or
+ // - an unfinished ctx, we will resubmit
+
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ } else {
+ // unfinished ctx returned, choose another random update length and submit either
+ // UPDATE or LAST depending on the amount of buffer remaining
+ while ((ctx != NULL) && !(hash_ctx_complete(ctx))) {
+ j = (unsigned long)(ctx->user_data); // Get index of the returned ctx
+ buf_ptr[j] = bufs[j] + ctx->total_length;
+ len_rand = (rand() % SHA256_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ len_rem = lens[j] - ctx->total_length;
+
+ if (len_rem <= len_rand) // submit the rest of the job as LAST
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rem,
+ HASH_LAST);
+ else // submit the random update length as UPDATE
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rand,
+ HASH_UPDATE);
+ } // Either continue submitting any contexts returned here as UPDATE/LAST, or
+ // go back to submitting new jobs using the index i.
+
+ i++;
+ }
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha256_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha256_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer
+ len_rem = lens[i] - ctx->total_length;
+ len_rand = (rand() % SHA256_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ debug_char('+');
+ if (len_rem <= len_rand)
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha256_ctx_mgr_flush(mgr);
+ }
+
+ // Check result digest
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail %8X <=> %8X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha256_update rand: Pass\n");
+
+ return fail;
+}
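For reference, the HASH_FIRST/HASH_UPDATE/HASH_LAST discipline this update test drives can be condensed as below. This is a minimal single-job sketch, not ISA-L code: as in the test, every chunk except the last is assumed to be a multiple of SHA256_BLOCK_SIZE, len is assumed to be larger than chunk, a context is only resubmitted after submit or flush has handed it back, and submit_piece/hash_streaming are hypothetical helper names.

#include "sha256_mb.h"

/* Submit one chunk and wait until the manager hands this context back. */
static SHA256_HASH_CTX *submit_piece(SHA256_HASH_CTX_MGR *mgr, SHA256_HASH_CTX *ctx,
				     unsigned char *p, uint32_t n, int flag)
{
	SHA256_HASH_CTX *r = sha256_ctx_mgr_submit(mgr, ctx, p, n, flag);
	while (r == NULL)
		r = sha256_ctx_mgr_flush(mgr);
	return r;
}

static void hash_streaming(SHA256_HASH_CTX_MGR *mgr, SHA256_HASH_CTX *ctx,
			   unsigned char *buf, uint32_t len, uint32_t chunk)
{
	uint32_t off = 0;

	submit_piece(mgr, ctx, buf, chunk, HASH_FIRST);
	off += chunk;
	while (len - off > chunk) {
		submit_piece(mgr, ctx, buf + off, chunk, HASH_UPDATE);
		off += chunk;
	}
	submit_piece(mgr, ctx, buf + off, len - off, HASH_LAST);
	/* the digest is now in ctx->job.result_digest */
}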
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c
new file mode 100644
index 000000000..8a5b5a9b2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c
@@ -0,0 +1,241 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sha256_mb.h"
+
+typedef uint32_t DigestSHA256[SHA256_DIGEST_NWORDS];
+
+#define MSGS 7
+#define NUM_JOBS 1000
+
+#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS
+
+static uint8_t msg1[] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq";
+static uint8_t msg2[] = "0123456789:;<=>?@ABCDEFGHIJKLMNO";
+static uint8_t msg3[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<";
+static uint8_t msg4[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR";
+static uint8_t msg5[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?";
+static uint8_t msg6[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU";
+static uint8_t msg7[] = "";
+
+static DigestSHA256 expResultDigest1 = { 0x248D6A61, 0xD20638B8, 0xE5C02693, 0x0C3E6039,
+ 0xA33CE459, 0x64FF2167, 0xF6ECEDD4, 0x19DB06C1
+};
+
+static DigestSHA256 expResultDigest2 = { 0xD9C2E699, 0x586B948F, 0x4022C799, 0x4FFE14C6,
+ 0x3A4E8E31, 0x2EE2AEE1, 0xEBE51BED, 0x85705CFD
+};
+
+static DigestSHA256 expResultDigest3 = { 0xE3057651, 0x81295681, 0x7ECF1791, 0xFF9A1619,
+ 0xB2BC5CAD, 0x2AC00018, 0x92AE489C, 0x48DD10B3
+};
+
+static DigestSHA256 expResultDigest4 = { 0x0307DAA3, 0x7130A140, 0x270790F9, 0x95B71407,
+ 0x8EC752A6, 0x084EC1F3, 0xBD873D79, 0x3FF78383
+};
+
+static DigestSHA256 expResultDigest5 = { 0x679312F7, 0x2E18D599, 0x5F51BDC6, 0x4ED56AFD,
+ 0x9B5704D3, 0x4387E11C, 0xC2331089, 0x2CD45DAA
+};
+
+static DigestSHA256 expResultDigest6 = { 0x8B1767E9, 0x7BA7BBE5, 0xF9A6E8D9, 0x9996904F,
+ 0x3AF6562E, 0xA58AF438, 0x5D8D584B, 0x81C808CE
+};
+
+static DigestSHA256 expResultDigest7 = { 0xE3B0C442, 0x98FC1C14, 0x9AFBF4C8, 0x996FB924,
+ 0x27AE41E4, 0x649B934C, 0xA495991B, 0x7852B855
+};
+
+static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7 };
+
+static uint32_t *expResultDigest[MSGS] = {
+ expResultDigest1, expResultDigest2, expResultDigest3,
+ expResultDigest4, expResultDigest5, expResultDigest6,
+ expResultDigest7
+};
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+ uint32_t i, j, k, t, checked = 0;
+ uint32_t *good;
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha256_ctx_mgr_init(mgr);
+
+ // Init contexts before first use
+ for (i = 0; i < MSGS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ for (i = 0; i < MSGS; i++) {
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[i], strlen((char *)msgs[i]), HASH_ENTIRE);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+
+ }
+ }
+
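+	// sha256_ctx_mgr_submit() returns a completed context or NULL if nothing
+	// has finished yet; the flush loop below drains the jobs still in flight
+	// and checks their digests.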
+ while (1) {
+ ctx = sha256_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+				printf("Something bad happened during the flush."
+				       " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ // do larger test in pseudo-random order
+
+ // Init contexts before first use
+ for (i = 0; i < NUM_JOBS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ checked = 0;
+ for (i = 0; i < NUM_JOBS; i++) {
+ j = PSEUDO_RANDOM_NUM(i);
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ }
+ }
+ while (1) {
+ ctx = sha256_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+				printf("Something bad happened during the flush."
+				       " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ if (checked != NUM_JOBS) {
+ printf("only tested %d rather than %d\n", checked, NUM_JOBS);
+ return -1;
+ }
+
+ printf(" multibinary_sha256 test: Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c
new file mode 100644
index 000000000..51759d7a8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c
@@ -0,0 +1,129 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha256_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS 32
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 4000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 20
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
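+
+/* Illustrative note: the cached ("warm") variant is selected at compile time
+ * by defining CACHED_TEST (something like `cc -DCACHED_TEST ...`); the exact
+ * flags and targets come from the library's build system and may differ.
+ * Without it, the cold, larger-than-LLC configuration above is used. */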
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA256_DIGEST_NWORDS];
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("calloc failed test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ sha256_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ SHA256(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sha256_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sha256" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+
+ printf("Multi-buffer sha256 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha256_ossl_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_shortage_perf.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_shortage_perf.c
new file mode 100644
index 000000000..235ec74a8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_shortage_perf.c
@@ -0,0 +1,132 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha256_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS SHA256_MAX_LANES
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 10000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA256_DIGEST_NWORDS];
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ uint32_t nlanes;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("calloc failed test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ sha256_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ SHA256(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sha256_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb shortage tests
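+	// The sweep below resubmits the buffers with progressively fewer lanes
+	// (SHA256_MAX_LANES down to 1) to show how multi-buffer throughput
+	// degrades when the manager cannot be kept fully loaded.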
+ for (nlanes = TEST_BUFS; nlanes > 0; nlanes--) {
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < nlanes; i++)
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN,
+ HASH_ENTIRE);
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sha256" TEST_TYPE_STR " with %d lanes: ", nlanes);
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < nlanes; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ }
+
+ printf("Multi-buffer sha256 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+		printf(" multibinary_sha256_ossl_shortage_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm
new file mode 100644
index 000000000..f45669c6e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm
@@ -0,0 +1,930 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute 16-wide SHA256 using AVX512
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; zmm0-31
+;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp r8
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp r8
+;;
+;; clobbers zmm0-31
+
+%define APPEND(a,b) a %+ b
+
+; Define Stack Layout
+START_FIELDS
+;;; name size align
+FIELD _DIGEST_SAVE, 8*64, 64
+FIELD _rsp, 8, 8
+%assign STACK_SPACE _FIELD_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx ; arg0 preserved
+ %define arg2 rdx ; arg1
+ %define reg3 r8 ; arg2 preserved
+ %define reg4 r9 ; arg3
+ %define var1 rdi
+ %define var2 rsi
+ %define local_func_decl(func_name) global func_name
+ %else
+ %define arg1 rdi ; arg0
+ %define arg2 rsi ; arg1
+ %define var1 rdx ; arg2
+ %define var2 rcx ; arg3
+ %define local_func_decl(func_name) mk_global func_name, function, internal
+%endif
+
+%define state arg1
+%define num_blks arg2
+
+%define IN (state + _data_ptr)
+%define DIGEST state
+%define SIZE num_blks
+
+%define IDX var1
+%define TBL var2
+
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define F zmm5
+%define G zmm6
+%define H zmm7
+%define T1 zmm8
+%define TMP0 zmm9
+%define TMP1 zmm10
+%define TMP2 zmm11
+%define TMP3 zmm12
+%define TMP4 zmm13
+%define TMP5 zmm14
+%define TMP6 zmm15
+
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 rax
+
+%macro TRANSPOSE16 18
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%r8 %9
+%define %%r9 %10
+%define %%r10 %11
+%define %%r11 %12
+%define %%r12 %13
+%define %%r13 %14
+%define %%r14 %15
+%define %%r15 %16
+%define %%t0 %17
+%define %%t1 %18
+
+; r0 = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0}
+; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0}
+; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0}
+; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0}
+; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0}
+; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0}
+; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0}
+; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0}
+; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0}
+
+; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+
+
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2}
+
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2}
+
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0}
+
+ ; use r6 in place of t0
+ vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0}
+ vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2}
+ vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0}
+ vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2}
+
+	vshufps	%%r11, %%r6, %%t1, 0xDD	; r11 = {l13 k13 j13 i13  l9  k9  j9  i9   l5  k5  j5  i5   l1  k1  j1  i1}
+	vshufps	%%r9, %%r8, %%r10, 0x88	; r9  = {l14 k14 j14 i14  l10 k10 j10 i10  l6  k6  j6  i6   l2  k2  j2  i2}
+	vshufps	%%r8, %%r8, %%r10, 0xDD	; r8  = {l15 k15 j15 i15  l11 k11 j11 i11  l7  k7  j7  i7   l3  k3  j3  i3}
+	vshufps	%%r6, %%r6, %%t1, 0x88	; r6  = {l12 k12 j12 i12  l8  k8  j8  i8   l4  k4  j4  i4   l0  k0  j0  i0}
+
+ ; use r10 in place of t0
+	vshufps	%%r10, %%r12, %%r13, 0x44	; r10 = {n13 n12 m13 m12  n9  n8  m9  m8   n5  n4  m5  m4   n1  n0  m1  m0}
+	vshufps	%%r12, %%r12, %%r13, 0xEE	; r12 = {n15 n14 m15 m14  n11 n10 m11 m10  n7  n6  m7  m6   n3  n2  m3  m2}
+	vshufps	%%t1, %%r14, %%r15, 0x44	; t1  = {p13 p12 o13 o12  p9  p8  o9  o8   p5  p4  o5  o4   p1  p0  o1  o0}
+	vshufps	%%r14, %%r14, %%r15, 0xEE	; r14 = {p15 p14 o15 o14  p11 p10 o11 o10  p7  p6  o7  o6   p3  p2  o3  o2}
+
+	vshufps	%%r15, %%r10, %%t1, 0xDD	; r15 = {p13 o13 n13 m13  p9  o9  n9  m9   p5  o5  n5  m5   p1  o1  n1  m1}
+	vshufps	%%r13, %%r12, %%r14, 0x88	; r13 = {p14 o14 n14 m14  p10 o10 n10 m10  p6  o6  n6  m6   p2  o2  n2  m2}
+	vshufps	%%r12, %%r12, %%r14, 0xDD	; r12 = {p15 o15 n15 m15  p11 o11 n11 m11  p7  o7  n7  m7   p3  o3  n3  m3}
+	vshufps	%%r10, %%r10, %%t1, 0x88	; r10 = {p12 o12 n12 m12  p8  o8  n8  m8   p4  o4  n4  m4   p0  o0  n0  m0}
+
+;; At this point, the registers that contain interesting data are:
+;; t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12
+;; Can use t1 and r14 as scratch registers
+
+ vmovdqa32 %%r14, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0}
+ vmovdqa32 %%t1, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vmovdqa32 %%r2, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1}
+ vmovdqa32 %%t0, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vmovdqa32 %%r3, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r7, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vmovdqa32 %%r1, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3}
+ vmovdqa32 %%r5, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vmovdqa32 %%r0, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0}
+ vmovdqa32 %%r4, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4}
+
+ vmovdqa32 %%r6, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1}
+ vmovdqa32 %%r10, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5}
+
+ vmovdqa32 %%r11, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r11, %%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2}
+ vmovdqa32 %%r15, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6}
+
+ vmovdqa32 %%r9, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3}
+ vmovdqa32 %%r13, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7}
+
+;; At this point r8 and r12 can be used as scratch registers
+
+ vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+ vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+
+ vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+ vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+ vshuff64x2 %%t1, %%r7, %%r15, 0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+ vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+ vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+ vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+
+ vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+ vshuff64x2 %%r3, %%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+
+ vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+ vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+
+ vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+%endmacro
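+
+;; TRANSPOSE16 above performs a full 16x16 transpose of 32-bit elements in
+;; three stages: vshufps pairs within 128-bit lanes, vpermi2q with the
+;; PSHUFFLE_TRANSPOSE16_MASK tables to gather the 128-bit quarters, and
+;; vshuff64x2 to place the 256-bit halves, so each output register ends up
+;; holding the same word position from all 16 lanes.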
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ H
+%xdefine H G
+%xdefine G F
+%xdefine F E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+;; CH(A, B, C) = (A&B) ^ (~A&C)
+;; MAJ(E, F, G) = (E&F) ^ (E&G) ^ (F&G)
+;; SIGMA0 = ROR_2 ^ ROR_13 ^ ROR_22
+;; SIGMA1 = ROR_6 ^ ROR_11 ^ ROR_25
+;; sigma0 = ROR_7 ^ ROR_18 ^ SHR_3
+;; sigma1 = ROR_17 ^ ROR_19 ^ SHR_10
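+;;
+;; In the rounds below each of these three-input boolean functions is a single
+;; vpternlogd; the immediate is the 8-bit truth table of the function, indexed
+;; by (dst_bit<<2)|(src1_bit<<1)|src2_bit:
+;;   0xCA -> CH   (op1 ? op2 : op3)
+;;   0xE8 -> MAJ  (majority of the three inputs)
+;;   0x96 -> three-way XOR, used to combine the rotates for SIGMA/sigma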
+
+; Main processing loop per round
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%ROUND %2
+ ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt
+ ;; T2 = SIGMA0(A) + MAJ(A, B, C)
+ ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
+
+ ;; H becomes T2, then add T1 for A
+ ;; D becomes D + T1 for E
+
+ vpaddd T1, H, TMP3 ; T1 = H + Kt
+ vmovdqa32 TMP0, E
+ vprord TMP1, E, 6 ; ROR_6(E)
+ vprord TMP2, E, 11 ; ROR_11(E)
+ vprord TMP3, E, 25 ; ROR_25(E)
+ vpternlogd TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G)
+ vpaddd T1, T1, %%WT ; T1 = T1 + Wt
+ vpternlogd TMP1, TMP2, TMP3, 0x96 ; TMP1 = SIGMA1(E)
+ vpaddd T1, T1, TMP0 ; T1 = T1 + CH(E,F,G)
+ vpaddd T1, T1, TMP1 ; T1 = T1 + SIGMA1(E)
+ vpaddd D, D, T1 ; D = D + T1
+
+ vprord H, A, 2 ; ROR_2(A)
+ vprord TMP2, A, 13 ; ROR_13(A)
+ vprord TMP3, A, 22 ; ROR_22(A)
+ vmovdqa32 TMP0, A
+ vpternlogd TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C)
+ vpternlogd H, TMP2, TMP3, 0x96 ; H(T2) = SIGMA0(A)
+ vpaddd H, H, TMP0 ; H(T2) = SIGMA0(A) + MAJ(A,B,C)
+ vpaddd H, H, T1 ; H(A) = H(T2) + T1
+
+ vmovdqa32 TMP3, [TBL + ((%%ROUND+1)*64)] ; Next Kt
+
+ ;; Rotate the args A-H (rotation of names associated with regs)
+ ROTATE_ARGS
+%endmacro
+
+; This is supposed to be SKL optimized assuming:
+; vpternlog, vpaddd ports 5,8
+; vprord ports 1,8
+; However, vprord is only working on port 8
+;
+; Main processing loop per round
+; Get the msg schedule word Wt+16 from the current, now unnecessary, word
+%macro PROCESS_LOOP_00_47 5
+%define %%WT %1
+%define %%ROUND %2
+%define %%WTp1 %3
+%define %%WTp9 %4
+%define %%WTp14 %5
+ ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt
+ ;; T2 = SIGMA0(A) + MAJ(A, B, C)
+ ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
+
+ ;; H becomes T2, then add T1 for A
+ ;; D becomes D + T1 for E
+
+ ;; For next value in msg schedule
+ ;; Wt+16 = sigma1(Wt+14) + Wt+9 + sigma0(Wt+1) + Wt
+
+ vmovdqa32 TMP0, E
+ vprord TMP1, E, 6 ; ROR_6(E)
+ vprord TMP2, E, 11 ; ROR_11(E)
+ vprord TMP3, E, 25 ; ROR_25(E)
+ vpternlogd TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G)
+ vpaddd T1, H, %%WT ; T1 = H + Wt
+ vpternlogd TMP1, TMP2, TMP3, 0x96 ; TMP1 = SIGMA1(E)
+ vpaddd T1, T1, TMP6 ; T1 = T1 + Kt
+ vprord H, A, 2 ; ROR_2(A)
+ vpaddd T1, T1, TMP0 ; T1 = T1 + CH(E,F,G)
+ vprord TMP2, A, 13 ; ROR_13(A)
+ vmovdqa32 TMP0, A
+ vprord TMP3, A, 22 ; ROR_22(A)
+ vpaddd T1, T1, TMP1 ; T1 = T1 + SIGMA1(E)
+ vpternlogd TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C)
+ vpaddd D, D, T1 ; D = D + T1
+ vpternlogd H, TMP2, TMP3, 0x96 ; H(T2) = SIGMA0(A)
+ vprord TMP4, %%WTp14, 17 ; ROR_17(Wt-2)
+ vpaddd H, H, TMP0 ; H(T2) = SIGMA0(A) + MAJ(A,B,C)
+ vprord TMP5, %%WTp14, 19 ; ROR_19(Wt-2)
+ vpsrld TMP6, %%WTp14, 10 ; SHR_10(Wt-2)
+ vpaddd H, H, T1 ; H(A) = H(T2) + T1
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma1(Wt-2)
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2)
+ vprord TMP4, %%WTp1, 7 ; ROR_7(Wt-15)
+ vprord TMP5, %%WTp1, 18 ; ROR_18(Wt-15)
+ vpaddd %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7
+ vpsrld TMP6, %%WTp1, 3 ; SHR_3(Wt-15)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma0(Wt-15)
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) +
+						;      Wt-7 + sigma0(Wt-15)
+
+ vmovdqa32 TMP6, [TBL + ((%%ROUND+1)*64)] ; Next Kt
+
+ ;; Rotate the args A-H (rotation of names associated with regs)
+ ROTATE_ARGS
+%endmacro
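+
+; Note: as written, the 48-round loop below uses PROCESS_LOOP together with
+; MSG_SCHED_ROUND_16_63 rather than this fused variant; the commented-out call
+; near the "%rep 48" block shows how PROCESS_LOOP_00_47 would be invoked.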
+
+%macro MSG_SCHED_ROUND_16_63 4
+%define %%WT %1
+%define %%WTp1 %2
+%define %%WTp9 %3
+%define %%WTp14 %4
+ vprord TMP4, %%WTp14, 17 ; ROR_17(Wt-2)
+ vprord TMP5, %%WTp14, 19 ; ROR_19(Wt-2)
+ vpsrld TMP6, %%WTp14, 10 ; SHR_10(Wt-2)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma1(Wt-2)
+
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2)
+ vpaddd %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7
+
+ vprord TMP4, %%WTp1, 7 ; ROR_7(Wt-15)
+ vprord TMP5, %%WTp1, 18 ; ROR_18(Wt-15)
+ vpsrld TMP6, %%WTp1, 3 ; SHR_3(Wt-15)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma0(Wt-15)
+
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) +
+						;      Wt-7 + sigma0(Wt-15)
+%endmacro
+
+; Note this is reading in a block of data for one lane
+; When all 16 are read, the data must be transposed to build msg schedule
+%macro MSG_SCHED_ROUND_00_15 2
+%define %%WT %1
+%define %%OFFSET %2
+ mov inp0, [IN + (%%OFFSET*8)]
+ vmovups %%WT, [inp0+IDX]
+%endmacro
+
+align 64
+
+;; void sha256_mb_x16_avx512(SHA256_MB_ARGS_X16, uint32_t size)
+; arg 1 : pointer to SHA256_MB_ARGS_X16 structure (digests and input data pointers)
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+local_func_decl(sha256_mb_x16_avx512)
+sha256_mb_x16_avx512:
+ endbranch
+ mov rax, rsp
+ sub rsp, STACK_SPACE
+ and rsp, ~63 ; align stack to multiple of 64
+ mov [rsp + _rsp], rax
+ lea TBL, [TABLE]
+
+ ;; Initialize digests
+ vmovups A, [DIGEST + 0*64]
+ vmovups B, [DIGEST + 1*64]
+ vmovups C, [DIGEST + 2*64]
+ vmovups D, [DIGEST + 3*64]
+ vmovups E, [DIGEST + 4*64]
+ vmovups F, [DIGEST + 5*64]
+ vmovups G, [DIGEST + 6*64]
+ vmovups H, [DIGEST + 7*64]
+
+ ; Do we need to transpose digests???
+	; SHA1 does not, but SHA256 has been doing so
+
+ xor IDX, IDX
+
+ ;; Read in first block of input data
+ ;; Transpose input data
+ mov inp0, [IN + 0*8]
+ mov inp1, [IN + 1*8]
+ mov inp2, [IN + 2*8]
+ mov inp3, [IN + 3*8]
+ mov inp4, [IN + 4*8]
+ mov inp5, [IN + 5*8]
+ mov inp6, [IN + 6*8]
+ mov inp7, [IN + 7*8]
+
+ vmovups W0,[inp0+IDX]
+ vmovups W1,[inp1+IDX]
+ vmovups W2,[inp2+IDX]
+ vmovups W3,[inp3+IDX]
+ vmovups W4,[inp4+IDX]
+ vmovups W5,[inp5+IDX]
+ vmovups W6,[inp6+IDX]
+ vmovups W7,[inp7+IDX]
+
+ mov inp0, [IN + 8*8]
+ mov inp1, [IN + 9*8]
+ mov inp2, [IN +10*8]
+ mov inp3, [IN +11*8]
+ mov inp4, [IN +12*8]
+ mov inp5, [IN +13*8]
+ mov inp6, [IN +14*8]
+ mov inp7, [IN +15*8]
+
+ vmovups W8, [inp0+IDX]
+ vmovups W9, [inp1+IDX]
+ vmovups W10,[inp2+IDX]
+ vmovups W11,[inp3+IDX]
+ vmovups W12,[inp4+IDX]
+ vmovups W13,[inp5+IDX]
+ vmovups W14,[inp6+IDX]
+ vmovups W15,[inp7+IDX]
+
+
+lloop:
+ vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK]
+
+ vmovdqa32 TMP3, [TBL] ; First K
+
+ ; Save digests for later addition
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H
+
+ add IDX, 64
+
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+
+%assign I 0
+%rep 16
+ vpshufb APPEND(W,I), APPEND(W,I), TMP2
+%assign I (I+1)
+%endrep
+
+ ; MSG Schedule for W0-W15 is now complete in registers
+ ; Process first 48 rounds
+ ; Calculate next Wt+16 after processing is complete and Wt is unneeded
+
+ ; PROCESS_LOOP_00_47 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M)
+
+%assign I 0
+%assign J 0
+%assign K 1
+%assign L 9
+%assign M 14
+%rep 48
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_16_63 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+%assign I (I+1)
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%endrep
+
+	; Check if this is the last block
+ sub SIZE, 1
+ je lastLoop
+
+ ; Process last 16 rounds
+ ; Read in next block msg data for use in first 16 words of msg sched
+%assign I 48
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_00_15 APPEND(W,J), J
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddd B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddd C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddd D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddd E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddd F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddd G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddd H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+ jmp lloop
+
+lastLoop:
+ ; Process last 16 rounds
+%assign I 48
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddd B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddd C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddd D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddd E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddd F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddd G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddd H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+	;; update input data pointers
+%assign I 0
+%rep 8
+ mov inp0, [IN + (2*I)*8]
+ mov inp1, [IN + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [IN + (2*I)*8], inp0
+ mov [IN + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+
+ ; Write out digest
+ ; Do we need to untranspose digests???
+ vmovups [DIGEST + 0*64], A
+ vmovups [DIGEST + 1*64], B
+ vmovups [DIGEST + 2*64], C
+ vmovups [DIGEST + 3*64], D
+ vmovups [DIGEST + 4*64], E
+ vmovups [DIGEST + 5*64], F
+ vmovups [DIGEST + 6*64], G
+ vmovups [DIGEST + 7*64], H
+
+
+ mov rsp, [rsp + _rsp]
+ ret
+
+ section .data
+align 64
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+
+
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000
+ dq 0x0000000000000001
+ dq 0x0000000000000008
+ dq 0x0000000000000009
+ dq 0x0000000000000004
+ dq 0x0000000000000005
+ dq 0x000000000000000C
+ dq 0x000000000000000D
+
+PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002
+ dq 0x0000000000000003
+ dq 0x000000000000000A
+ dq 0x000000000000000B
+ dq 0x0000000000000006
+ dq 0x0000000000000007
+ dq 0x000000000000000E
+ dq 0x000000000000000F
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_mb_x16_avx512
+no_sha256_mb_x16_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm
new file mode 100644
index 000000000..7f8f8829b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm
@@ -0,0 +1,431 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute quad SHA256 using AVX
+;; Logic designed/laid out by JDG
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t1}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+
+
+%define TABLE K256_4_MB
+%define SZ 4
+%define SZ4 4*SZ
+%define ROUNDS 64*SZ4
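+; ROUND advances in byte units: each round adds SZ4 (one four-lane K entry) and
+; indexes TABLE directly, so ROUNDS = 64*SZ4 is the offset reached after 64 rounds.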
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpslld %%tmp, %%reg, (32-(%%imm))
+ vpsrld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PRORD_nd reg, imm, tmp, src
+%macro PRORD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpslld %%tmp, %%src, (32-(%%imm))
+ vpsrld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORD dst/src, amt
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+; PRORD_nd dst, src, amt
+%macro PRORD_nd 3
+ PRORD_nd %1, %3, TMP, %2
+%endmacro
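+
+; AVX has no 32-bit vector rotate instruction, so PRORD/PRORD_nd build the
+; rotate from two shifts and an OR; the scalar equivalent is
+; ror32(x, imm) = (x >> imm) | (x << (32 - imm)).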
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+
+
+ PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, g ; a2 = ch
+
+ PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
+ vmovdqa [SZ4*(%%i&0xf) + rsp], %%T1
+ vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd h, h, a2 ; h = h + ch
+ PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
+ vpaddd h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
+ vpxor %%T1, a, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddd h, h, a0
+
+ vpaddd d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddd h, h, a1 ; h = h + ch + W + K + maj
+ vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+
+ vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp]
+ vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp]
+ vmovdqa a0, %%T1
+ PRORD %%T1, 18-7
+ vmovdqa a2, a1
+ PRORD a1, 19-17
+ vpxor %%T1, %%T1, a0
+ PRORD %%T1, 7
+ vpxor a1, a1, a2
+ PRORD a1, 17
+ vpsrld a0, a0, 3
+ vpxor %%T1, %%T1, a0
+ vpsrld a2, a2, 10
+ vpxor a1, a1, a2
+ vpaddd %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp]
+ vpaddd a1, a1, [SZ4*((%%i-7)&0xf) + rsp]
+ vpaddd %%T1, %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+%endm
+
+%define DIGEST_SIZE 8*SZ4
+%define DATA 16*SZ4
+%define ALIGNMENT 1*8
+; ALIGNMENT makes FRAMESZ + pushes an odd multiple of 8
+%define FRAMESZ (DATA + DIGEST_SIZE + ALIGNMENT)
+%define _DIGEST (DATA)
+
+%define VMOVPS vmovups
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux definitions
+ %define arg1 rdi
+ %define arg2 rsi
+%else
+ ; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+%endif
+
+; Common definitions
+%define IDX rax
+%define ROUND rbx
+%define TBL r12
+
+;; void sha256_mb_x4_avx(SHA256_MB_ARGS_X8 *args, uint64_t len);
+;; arg 1 : arg1 : pointer args (only 4 of the 8 lanes used)
+;; arg 2 : arg2 : size of data in blocks (assumed >= 1)
+;;
+;; Clobbers registers: arg2, rax, rbx, r8-r12, xmm0-xmm15
+;;
+mk_global sha256_mb_x4_avx, function, internal
+align 32
+sha256_mb_x4_avx:
+ endbranch
+ sub rsp, FRAMESZ
+
+ ;; Initialize digests
+ vmovdqa a,[arg1+0*SZ4]
+ vmovdqa b,[arg1+1*SZ4]
+ vmovdqa c,[arg1+2*SZ4]
+ vmovdqa d,[arg1+3*SZ4]
+ vmovdqa e,[arg1+4*SZ4]
+ vmovdqa f,[arg1+5*SZ4]
+ vmovdqa g,[arg1+6*SZ4]
+ vmovdqa h,[arg1+7*SZ4]
+
+ lea TBL,[TABLE]
+
+ ;; transpose input onto stack
+ mov inp0,[arg1 + _data_ptr + 0*8]
+ mov inp1,[arg1 + _data_ptr + 1*8]
+ mov inp2,[arg1 + _data_ptr + 2*8]
+ mov inp3,[arg1 + _data_ptr + 3*8]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ ;; save old digest
+ vmovdqa [rsp + _DIGEST + 0*SZ4], a
+ vmovdqa [rsp + _DIGEST + 1*SZ4], b
+ vmovdqa [rsp + _DIGEST + 2*SZ4], c
+ vmovdqa [rsp + _DIGEST + 3*SZ4], d
+ vmovdqa [rsp + _DIGEST + 4*SZ4], e
+ vmovdqa [rsp + _DIGEST + 5*SZ4], f
+ vmovdqa [rsp + _DIGEST + 6*SZ4], g
+ vmovdqa [rsp + _DIGEST + 7*SZ4], h
+
+%assign i 0
+%rep 4
+ vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ VMOVPS TT2,[inp0+IDX+i*16]
+ VMOVPS TT1,[inp1+IDX+i*16]
+ VMOVPS TT4,[inp2+IDX+i*16]
+ VMOVPS TT3,[inp3+IDX+i*16]
+ TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
+ vpshufb TT0, TT0, TMP
+ vpshufb TT1, TT1, TMP
+ vpshufb TT2, TT2, TMP
+ vpshufb TT3, TT3, TMP
+ ROUND_00_15 TT0,(i*4+0)
+ ROUND_00_15 TT1,(i*4+1)
+ ROUND_00_15 TT2,(i*4+2)
+ ROUND_00_15 TT3,(i*4+3)
+%assign i (i+1)
+%endrep
+ add IDX, 4*4*4
+
+
+%assign i (i*4)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ vpaddd a, a, [rsp + _DIGEST + 0*SZ4]
+ vpaddd b, b, [rsp + _DIGEST + 1*SZ4]
+ vpaddd c, c, [rsp + _DIGEST + 2*SZ4]
+ vpaddd d, d, [rsp + _DIGEST + 3*SZ4]
+ vpaddd e, e, [rsp + _DIGEST + 4*SZ4]
+ vpaddd f, f, [rsp + _DIGEST + 5*SZ4]
+ vpaddd g, g, [rsp + _DIGEST + 6*SZ4]
+ vpaddd h, h, [rsp + _DIGEST + 7*SZ4]
+
+
+ sub arg2, 1
+ jne lloop
+
+ ; write digests out
+ vmovdqa [arg1+0*SZ4],a
+ vmovdqa [arg1+1*SZ4],b
+ vmovdqa [arg1+2*SZ4],c
+ vmovdqa [arg1+3*SZ4],d
+ vmovdqa [arg1+4*SZ4],e
+ vmovdqa [arg1+5*SZ4],f
+ vmovdqa [arg1+6*SZ4],g
+ vmovdqa [arg1+7*SZ4],h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [arg1 + _data_ptr + 0*8], inp0
+ add inp1, IDX
+ mov [arg1 + _data_ptr + 1*8], inp1
+ add inp2, IDX
+ mov [arg1 + _data_ptr + 2*8], inp2
+ add inp3, IDX
+ mov [arg1 + _data_ptr + 3*8], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, FRAMESZ
+ ret
+
+section .data align=64
+
+align 64
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm
new file mode 100644
index 000000000..2d349abbc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm
@@ -0,0 +1,426 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute quad SHA256 using SSE
+;; Logic designed/laid out by JDG
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t1}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ movaps %%t0, %%r0 ; t0 = {a3 a2 a1 a0}
+ shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ movaps %%t1, %%r2 ; t1 = {c3 c2 c1 c0}
+ shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ movaps %%r1, %%t0 ; r1 = {b1 b0 a1 a0}
+ shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ movaps %%r3, %%r0 ; r3 = {b3 b2 a3 a2}
+ shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+
+
+%define TABLE K256_4_MB
+%define SZ 4
+%define SZ4 4*SZ
+%define ROUNDS 64*SZ4
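+; ROUND advances by SZ4 per round, so ROUNDS (64*SZ4) marks the end of the
+; 64 rounds of one block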
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ psrld %%reg, %%imm
+ pslld %%tmp, (32-(%%imm))
+ por %%reg, %%tmp
+%endmacro
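+; (SSE has no packed rotate instruction, so the rotate right is emulated with
+; two shifts and an OR; the two-operand form below uses TMP as the scratch register)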
+
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+
+
+ movdqa a0, e ; sig1: a0 = e
+ movdqa a1, e ; sig1: s1 = e
+ PRORD a0, (11-6) ; sig1: a0 = (e >> 5)
+
+ movdqa a2, f ; ch: a2 = f
+ pxor a2, g ; ch: a2 = f^g
+ pand a2, e ; ch: a2 = (f^g)&e
+ pxor a2, g ; a2 = ch
+
+ PRORD a1, 25 ; sig1: a1 = (e >> 25)
+ movdqa [SZ4*(%%i&0xf) + rsp],%%T1
+ paddd %%T1,[TBL + ROUND] ; T1 = W + K
+ pxor a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ paddd h, a2 ; h = h + ch
+ movdqa a2, a ; sig0: a2 = a
+ PRORD a2, (13-2) ; sig0: a2 = (a >> 11)
+ paddd h, %%T1 ; h = h + ch + W + K
+ pxor a0, a1 ; a0 = sigma1
+ movdqa a1, a ; sig0: a1 = a
+ movdqa %%T1, a ; maj: T1 = a
+ PRORD a1, 22 ; sig0: a1 = (a >> 22)
+ pxor %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ pand %%T1, b ; maj: T1 = (a^c)&b
+ paddd h, a0
+
+ paddd d, h
+
+ pxor a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ pxor a2, a1 ; a2 = sig0
+ movdqa a1, a ; maj: a1 = a
+ pand a1, c ; maj: a1 = a&c
+ por a1, %%T1 ; a1 = maj
+ paddd h, a1 ; h = h + ch + W + K + maj
+ paddd h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+
+ movdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp]
+ movdqa a1, [SZ4*((%%i-2)&0xf) + rsp]
+ movdqa a0, %%T1
+ PRORD %%T1, 18-7
+ movdqa a2, a1
+ PRORD a1, 19-17
+ pxor %%T1, a0
+ PRORD %%T1, 7
+ pxor a1, a2
+ PRORD a1, 17
+ psrld a0, 3
+ pxor %%T1, a0
+ psrld a2, 10
+ pxor a1, a2
+ paddd %%T1, [SZ4*((%%i-16)&0xf) + rsp]
+ paddd a1, [SZ4*((%%i-7)&0xf) + rsp]
+ paddd %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+%endm
+
+%define DIGEST_SIZE 8*SZ4
+%define DATA 16*SZ4
+%define ALIGNMENT 1*8
+; ALIGNMENT makes FRAMESZ + pushes an odd multiple of 8
+%define FRAMESZ (DATA + DIGEST_SIZE + ALIGNMENT)
+%define _DIGEST (DATA)
+
+%define MOVPS movups
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux definitions
+ %define arg1 rdi
+ %define arg2 rsi
+%else
+ ; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+%endif
+
+; Common definitions
+%define IDX rax
+%define ROUND rbx
+%define TBL r12
+
+;; void sha256_mb_x4_sse(SHA256_MB_ARGS_X8 *args, uint64_t len);
+;; arg 1 : pointer to args (only 4 of the 8 lanes used)
+;; arg 2 : size of data in blocks (assumed >= 1)
+;;
+;; Clobbers registers: arg2, rax, rbx, r8-r12, xmm0-xmm15
+;;
+
+mk_global sha256_mb_x4_sse, function, internal
+align 32
+sha256_mb_x4_sse:
+ endbranch
+ sub rsp, FRAMESZ
+
+ ;; Initialize digests
+ movdqa a,[arg1+0*SZ4]
+ movdqa b,[arg1+1*SZ4]
+ movdqa c,[arg1+2*SZ4]
+ movdqa d,[arg1+3*SZ4]
+ movdqa e,[arg1+4*SZ4]
+ movdqa f,[arg1+5*SZ4]
+ movdqa g,[arg1+6*SZ4]
+ movdqa h,[arg1+7*SZ4]
+
+ lea TBL,[TABLE]
+
+ ;; transpose input onto stack
+ mov inp0,[arg1 + _data_ptr + 0*8]
+ mov inp1,[arg1 + _data_ptr + 1*8]
+ mov inp2,[arg1 + _data_ptr + 2*8]
+ mov inp3,[arg1 + _data_ptr + 3*8]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ ;; save old digest
+ movdqa [rsp + _DIGEST + 0*SZ4], a
+ movdqa [rsp + _DIGEST + 1*SZ4], b
+ movdqa [rsp + _DIGEST + 2*SZ4], c
+ movdqa [rsp + _DIGEST + 3*SZ4], d
+ movdqa [rsp + _DIGEST + 4*SZ4], e
+ movdqa [rsp + _DIGEST + 5*SZ4], f
+ movdqa [rsp + _DIGEST + 6*SZ4], g
+ movdqa [rsp + _DIGEST + 7*SZ4], h
+
+%assign i 0
+%rep 4
+ movdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ MOVPS TT2,[inp0+IDX+i*16]
+ MOVPS TT1,[inp1+IDX+i*16]
+ MOVPS TT4,[inp2+IDX+i*16]
+ MOVPS TT3,[inp3+IDX+i*16]
+ TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
+ pshufb TT0, TMP
+ pshufb TT1, TMP
+ pshufb TT2, TMP
+ pshufb TT3, TMP
+ ROUND_00_15 TT0,(i*4+0)
+ ROUND_00_15 TT1,(i*4+1)
+ ROUND_00_15 TT2,(i*4+2)
+ ROUND_00_15 TT3,(i*4+3)
+%assign i (i+1)
+%endrep
+ add IDX, 4*4*4
+
+
+%assign i (i*4)
+
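+	; i is now 16: the first 16 message words are on the stack and rounds 16..63
+	; are handled by ROUND_16_XX, which schedules each new word in place
+	; (the jmp below merely skips the padding emitted by "align 16")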
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ paddd a, [rsp + _DIGEST + 0*SZ4]
+ paddd b, [rsp + _DIGEST + 1*SZ4]
+ paddd c, [rsp + _DIGEST + 2*SZ4]
+ paddd d, [rsp + _DIGEST + 3*SZ4]
+ paddd e, [rsp + _DIGEST + 4*SZ4]
+ paddd f, [rsp + _DIGEST + 5*SZ4]
+ paddd g, [rsp + _DIGEST + 6*SZ4]
+ paddd h, [rsp + _DIGEST + 7*SZ4]
+
+
+ sub arg2, 1
+ jne lloop
+
+ ; write digests out
+ movdqa [arg1+0*SZ4],a
+ movdqa [arg1+1*SZ4],b
+ movdqa [arg1+2*SZ4],c
+ movdqa [arg1+3*SZ4],d
+ movdqa [arg1+4*SZ4],e
+ movdqa [arg1+5*SZ4],f
+ movdqa [arg1+6*SZ4],g
+ movdqa [arg1+7*SZ4],h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [arg1 + _data_ptr + 0*8], inp0
+ add inp1, IDX
+ mov [arg1 + _data_ptr + 1*8], inp1
+ add inp2, IDX
+ mov [arg1 + _data_ptr + 2*8], inp2
+ add inp3, IDX
+ mov [arg1 + _data_ptr + 3*8], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, FRAMESZ
+ ret
+
+section .data align=64
+
+align 64
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm
new file mode 100644
index 000000000..dbd9db1b8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm
@@ -0,0 +1,620 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute oct SHA256 (8 lanes) using 256-bit AVX2
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
+;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp r8
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp r8
+;;
+;; clobbers ymm0-15
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux definitions
+ %define arg1 rdi
+ %define arg2 rsi
+ %define reg3 rcx
+ %define reg4 rdx
+%else
+ ; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+ %define reg3 rsi
+ %define reg4 rdi
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+%define IDX rax
+%define ROUND rbx
+%define TBL reg3
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 reg4
+
+; ymm0 a
+; ymm1 b
+; ymm2 c
+; ymm3 d
+; ymm4 e
+; ymm5 f
+; ymm6 g TMP0
+; ymm7 h TMP1
+; ymm8 T1 TT0
+; ymm9 TT1
+; ymm10 TT2
+; ymm11 TT3
+; ymm12 a0 TT4
+; ymm13 a1 TT5
+; ymm14 a2 TT6
+; ymm15 TMP TT7
+
+%define a ymm0
+%define b ymm1
+%define c ymm2
+%define d ymm3
+%define e ymm4
+%define f ymm5
+%define g ymm6
+%define h ymm7
+
+%define T1 ymm8
+
+%define a0 ymm12
+%define a1 ymm13
+%define a2 ymm14
+%define TMP ymm15
+
+%define TMP0 ymm6
+%define TMP1 ymm7
+
+%define TT0 ymm8
+%define TT1 ymm9
+%define TT2 ymm10
+%define TT3 ymm11
+%define TT4 ymm12
+%define TT5 ymm13
+%define TT6 ymm14
+%define TT7 ymm15
+
+%define SZ8 8*SHA256_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 64*SZ8
+%define PTR_SZ 8
+%define SHA256_DIGEST_WORD_SIZE 4
+%define MAX_SHA256_LANES 8
+%define NUM_SHA256_DIGEST_WORDS 8
+%define SHA256_DIGEST_ROW_SIZE (MAX_SHA256_LANES * SHA256_DIGEST_WORD_SIZE)
+
+; Define stack usage
+
+;; Assume stack aligned to 32 bytes before call
+;; Therefore FRAMESZ mod 32 must be 32-8 = 24
+struc stack_frame
+ .data resb 16*SZ8
+ .digest resb 8*SZ8
+ .ytmp resb 4*SZ8
+ .rsp resb 8
+endstruc
+%define FRAMESZ stack_frame_size
+%define _DIGEST stack_frame.digest
+%define _YTMP stack_frame.ytmp
+%define _RSP_SAVE stack_frame.rsp
+
+%define YTMP0 rsp + _YTMP + 0*SZ8
+%define YTMP1 rsp + _YTMP + 1*SZ8
+%define YTMP2 rsp + _YTMP + 2*SZ8
+%define YTMP3 rsp + _YTMP + 3*SZ8
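+; YTMP0..3 are stack spill slots: the transpose and the early rounds need more
+; ymm registers than exist, so g/h and TT4..TT7 are parked here temporarily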
+
+%define VMOVPS vmovups
+
+; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+; "transpose" data in {r0...r7} using temps {t0...t1}
+; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
+;
+; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+;
+%macro TRANSPOSE8 10
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ ; process bottom half (r4..r7) {e...h}
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
+
+ vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6
+ vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2
+ vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5
+ vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1
+ vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7
+ vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3
+ vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4
+ vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0
+%endmacro
+
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpslld %%tmp, %%reg, (32-(%%imm))
+ vpsrld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PRORD_nd reg, imm, tmp, src
+%macro PRORD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpslld %%tmp, %%src, (32-(%%imm))
+ vpsrld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORD dst/src, amt
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+; PRORD_nd dst, src, amt
+%macro PRORD_nd 3
+ PRORD_nd %1, %3, TMP, %2
+%endmacro
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+ PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
+ vmovdqa [SZ8*(%%i&0xf) + rsp], %%T1
+ vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd h, h, a2 ; h = h + ch
+ PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
+ vpaddd h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
+ vpxor %%T1, a, c ; maj: T1 = a^c
+ add ROUND, SZ8 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddd h, h, a0
+
+ vpaddd d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddd h, h, a1 ; h = h + ch + W + K + maj
+ vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+ vmovdqa %%T1, [SZ8*((%%i-15)&0xf) + rsp]
+ vmovdqa a1, [SZ8*((%%i-2)&0xf) + rsp]
+ vmovdqa a0, %%T1
+ PRORD %%T1, 18-7
+ vmovdqa a2, a1
+ PRORD a1, 19-17
+ vpxor %%T1, %%T1, a0
+ PRORD %%T1, 7
+ vpxor a1, a1, a2
+ PRORD a1, 17
+ vpsrld a0, a0, 3
+ vpxor %%T1, %%T1, a0
+ vpsrld a2, a2, 10
+ vpxor a1, a1, a2
+ vpaddd %%T1, %%T1, [SZ8*((%%i-16)&0xf) + rsp]
+ vpaddd a1, a1, [SZ8*((%%i-7)&0xf) + rsp]
+ vpaddd %%T1, %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+
+%endm
+
+
+;; void sha256_mb_x8_avx2(SHA256_ARGS *args, uint64_t size_in_blocks);
+;; arg 1 : STATE : pointer to args (digests and data pointers for the 8 lanes)
+;; arg 2 : INP_SIZE : size of input in blocks (assumed >= 1)
+mk_global sha256_mb_x8_avx2, function, internal
+align 16
+sha256_mb_x8_avx2:
+ endbranch
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+
+ ; save rsp, allocate 32-byte aligned for local variables
+ mov IDX, rsp
+ sub rsp, FRAMESZ
+ and rsp, ~31
+ mov [rsp + _RSP_SAVE], IDX
+
+
+ ;; Load the pre-transposed incoming digest.
+ vmovdqu a,[STATE + 0*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu b,[STATE + 1*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu c,[STATE + 2*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu d,[STATE + 3*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu e,[STATE + 4*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu f,[STATE + 5*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu g,[STATE + 6*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu h,[STATE + 7*SHA256_DIGEST_ROW_SIZE]
+
+ lea TBL,[K256_8_MB]
+
+ ;; load the address of each of the 4 message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _args_data_ptr + 0*PTR_SZ]
+ mov inp1,[STATE + _args_data_ptr + 1*PTR_SZ]
+ mov inp2,[STATE + _args_data_ptr + 2*PTR_SZ]
+ mov inp3,[STATE + _args_data_ptr + 3*PTR_SZ]
+ mov inp4,[STATE + _args_data_ptr + 4*PTR_SZ]
+ mov inp5,[STATE + _args_data_ptr + 5*PTR_SZ]
+ mov inp6,[STATE + _args_data_ptr + 6*PTR_SZ]
+ mov inp7,[STATE + _args_data_ptr + 7*PTR_SZ]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ ;; save old digest
+ vmovdqa [rsp + _DIGEST + 0*SZ8], a
+ vmovdqa [rsp + _DIGEST + 1*SZ8], b
+ vmovdqa [rsp + _DIGEST + 2*SZ8], c
+ vmovdqa [rsp + _DIGEST + 3*SZ8], d
+ vmovdqa [rsp + _DIGEST + 4*SZ8], e
+ vmovdqa [rsp + _DIGEST + 5*SZ8], f
+ vmovdqa [rsp + _DIGEST + 6*SZ8], g
+ vmovdqa [rsp + _DIGEST + 7*SZ8], h
+%assign i 0
+%rep 2
+ VMOVPS TT0,[inp0+IDX+i*32]
+ VMOVPS TT1,[inp1+IDX+i*32]
+ VMOVPS TT2,[inp2+IDX+i*32]
+ VMOVPS TT3,[inp3+IDX+i*32]
+ VMOVPS TT4,[inp4+IDX+i*32]
+ VMOVPS TT5,[inp5+IDX+i*32]
+ VMOVPS TT6,[inp6+IDX+i*32]
+ VMOVPS TT7,[inp7+IDX+i*32]
+ vmovdqa [YTMP0], g
+ vmovdqa [YTMP1], h
+ TRANSPOSE8 TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, TMP0, TMP1
+ vmovdqa TMP1, [PSHUFFLE_BYTE_FLIP_MASK]
+ vmovdqa g, [YTMP0]
+ vpshufb TT0, TT0, TMP1
+ vpshufb TT1, TT1, TMP1
+ vpshufb TT2, TT2, TMP1
+ vpshufb TT3, TT3, TMP1
+ vpshufb TT4, TT4, TMP1
+ vpshufb TT5, TT5, TMP1
+ vpshufb TT6, TT6, TMP1
+ vpshufb TT7, TT7, TMP1
+ vmovdqa h, [YTMP1]
+ vmovdqa [YTMP0], TT4
+ vmovdqa [YTMP1], TT5
+ vmovdqa [YTMP2], TT6
+ vmovdqa [YTMP3], TT7
+ ROUND_00_15 TT0,(i*8+0)
+ vmovdqa TT0, [YTMP0]
+ ROUND_00_15 TT1,(i*8+1)
+ vmovdqa TT1, [YTMP1]
+ ROUND_00_15 TT2,(i*8+2)
+ vmovdqa TT2, [YTMP2]
+ ROUND_00_15 TT3,(i*8+3)
+ vmovdqa TT3, [YTMP3]
+ ROUND_00_15 TT0,(i*8+4)
+ ROUND_00_15 TT1,(i*8+5)
+ ROUND_00_15 TT2,(i*8+6)
+ ROUND_00_15 TT3,(i*8+7)
+%assign i (i+1)
+%endrep
+ add IDX, 4*4*4
+
+%assign i (i*8)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ vpaddd a, a, [rsp + _DIGEST + 0*SZ8]
+ vpaddd b, b, [rsp + _DIGEST + 1*SZ8]
+ vpaddd c, c, [rsp + _DIGEST + 2*SZ8]
+ vpaddd d, d, [rsp + _DIGEST + 3*SZ8]
+ vpaddd e, e, [rsp + _DIGEST + 4*SZ8]
+ vpaddd f, f, [rsp + _DIGEST + 5*SZ8]
+ vpaddd g, g, [rsp + _DIGEST + 6*SZ8]
+ vpaddd h, h, [rsp + _DIGEST + 7*SZ8]
+
+ sub INP_SIZE, 1 ;; unit is blocks
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ vmovdqu [STATE + 0*SHA256_DIGEST_ROW_SIZE],a
+ vmovdqu [STATE + 1*SHA256_DIGEST_ROW_SIZE],b
+ vmovdqu [STATE + 2*SHA256_DIGEST_ROW_SIZE],c
+ vmovdqu [STATE + 3*SHA256_DIGEST_ROW_SIZE],d
+ vmovdqu [STATE + 4*SHA256_DIGEST_ROW_SIZE],e
+ vmovdqu [STATE + 5*SHA256_DIGEST_ROW_SIZE],f
+ vmovdqu [STATE + 6*SHA256_DIGEST_ROW_SIZE],g
+ vmovdqu [STATE + 7*SHA256_DIGEST_ROW_SIZE],h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [STATE + _args_data_ptr + 0*8], inp0
+ add inp1, IDX
+ mov [STATE + _args_data_ptr + 1*8], inp1
+ add inp2, IDX
+ mov [STATE + _args_data_ptr + 2*8], inp2
+ add inp3, IDX
+ mov [STATE + _args_data_ptr + 3*8], inp3
+ add inp4, IDX
+ mov [STATE + _args_data_ptr + 4*8], inp4
+ add inp5, IDX
+ mov [STATE + _args_data_ptr + 5*8], inp5
+ add inp6, IDX
+ mov [STATE + _args_data_ptr + 6*8], inp6
+ add inp7, IDX
+ mov [STATE + _args_data_ptr + 7*8], inp7
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+ mov rsp, [rsp + _RSP_SAVE]
+ ret
+
+section .data
+align 64
+K256_8_MB:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm
new file mode 100644
index 000000000..af54f7cc3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm
@@ -0,0 +1,125 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+default rel
+[bits 64]
+
+; declare the L3 ctx level symbols (these will then call the appropriate
+; L2 symbols)
+extern sha256_ctx_mgr_init_sse
+extern sha256_ctx_mgr_submit_sse
+extern sha256_ctx_mgr_flush_sse
+
+extern sha256_ctx_mgr_init_avx
+extern sha256_ctx_mgr_submit_avx
+extern sha256_ctx_mgr_flush_avx
+
+extern sha256_ctx_mgr_init_avx2
+extern sha256_ctx_mgr_submit_avx2
+extern sha256_ctx_mgr_flush_avx2
+
+extern sha256_ctx_mgr_init_base
+extern sha256_ctx_mgr_submit_base
+extern sha256_ctx_mgr_flush_base
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ extern sha256_ctx_mgr_init_avx512
+ extern sha256_ctx_mgr_submit_avx512
+ extern sha256_ctx_mgr_flush_avx512
+%endif
+
+%ifdef HAVE_AS_KNOWS_SHANI
+ extern sha256_ctx_mgr_init_sse_ni
+ extern sha256_ctx_mgr_submit_sse_ni
+ extern sha256_ctx_mgr_flush_sse_ni
+%endif
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ %ifdef HAVE_AS_KNOWS_SHANI
+ extern sha256_ctx_mgr_init_avx512_ni
+ extern sha256_ctx_mgr_submit_avx512_ni
+ extern sha256_ctx_mgr_flush_avx512_ni
+ %endif
+%endif
+
+;;; *_mbinit is the initial value of *_dispatched; *_dispatched is updated on the first call.
+;;; Therefore, *_dispatch_init is only executed on the first call.
+
+; Initialise symbols
+mbin_interface sha256_ctx_mgr_init
+mbin_interface sha256_ctx_mgr_submit
+mbin_interface sha256_ctx_mgr_flush
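+; (the dispatch plumbing itself lives in multibinary.asm: each interface above
+; jumps through its *_dispatched pointer, and the first call runs the CPU
+; feature checks that pick one of the implementations registered below)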
+
+%ifdef HAVE_AS_KNOWS_AVX512
+	; Reuse mbin_dispatch_init6's extension, replacing the base version with the SSE version
+ %ifdef HAVE_AS_KNOWS_SHANI
+ mbin_dispatch_base_to_avx512_shani sha256_ctx_mgr_init, sha256_ctx_mgr_init_base, \
+ sha256_ctx_mgr_init_sse, sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2, \
+ sha256_ctx_mgr_init_avx512, sha256_ctx_mgr_init_sse_ni, sha256_ctx_mgr_init_avx512_ni
+ mbin_dispatch_base_to_avx512_shani sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_base, \
+ sha256_ctx_mgr_submit_sse, sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2, \
+ sha256_ctx_mgr_submit_avx512, sha256_ctx_mgr_submit_sse_ni, sha256_ctx_mgr_submit_avx512_ni
+ mbin_dispatch_base_to_avx512_shani sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_base, \
+ sha256_ctx_mgr_flush_sse, sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2, \
+ sha256_ctx_mgr_flush_avx512, sha256_ctx_mgr_flush_sse_ni, sha256_ctx_mgr_flush_avx512_ni
+ %else
+ mbin_dispatch_init6 sha256_ctx_mgr_init, sha256_ctx_mgr_init_base, \
+ sha256_ctx_mgr_init_sse, sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2, \
+ sha256_ctx_mgr_init_avx512
+ mbin_dispatch_init6 sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_base, \
+ sha256_ctx_mgr_submit_sse, sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2, \
+ sha256_ctx_mgr_submit_avx512
+ mbin_dispatch_init6 sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_base, \
+ sha256_ctx_mgr_flush_sse, sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2, \
+ sha256_ctx_mgr_flush_avx512
+ %endif
+%else
+ %ifdef HAVE_AS_KNOWS_SHANI
+ mbin_dispatch_sse_to_avx2_shani sha256_ctx_mgr_init, sha256_ctx_mgr_init_sse, \
+ sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2, sha256_ctx_mgr_init_sse_ni
+ mbin_dispatch_sse_to_avx2_shani sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_sse, \
+ sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2, sha256_ctx_mgr_submit_sse_ni
+ mbin_dispatch_sse_to_avx2_shani sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_sse, \
+ sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2, sha256_ctx_mgr_flush_sse_ni
+ %else
+ mbin_dispatch_init sha256_ctx_mgr_init, sha256_ctx_mgr_init_sse, \
+ sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2
+ mbin_dispatch_init sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_sse, \
+ sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2
+ mbin_dispatch_init sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_sse, \
+ sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2
+ %endif
+%endif
+
+;;; func core, ver, snum
+slversion sha256_ctx_mgr_init, 00, 04, 0160
+slversion sha256_ctx_mgr_submit, 00, 04, 0161
+slversion sha256_ctx_mgr_flush, 00, 04, 0162
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x1.asm
new file mode 100644
index 000000000..25fc9ce16
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x1.asm
@@ -0,0 +1,361 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+%endif
+
+%define MSG xmm0
+%define STATE0 xmm1
+%define STATE1 xmm2
+%define MSGTMP0 xmm3
+%define MSGTMP1 xmm4
+%define MSGTMP2 xmm5
+%define MSGTMP3 xmm6
+%define MSGTMP4 xmm7
+
+%define SHUF_MASK xmm8
+
+%define ABEF_SAVE xmm9
+%define CDGH_SAVE xmm10
+
+; arg indices start from 0 here, while mgr_flush/submit count them from 1
+%define MGR arg0
+%define NBLK arg1
+%define NLANX4 r10 ; consistent with caller
+%define IDX r8 ; local variable -- consistent with caller
+%define DPTR r11 ; local variable -- input buffer pointer
+%define TMP r9 ; local variable -- assistant to address digest
+%define TBL rax
+;%define TMP2 r8 ; local variable -- assistant to address digest
+align 32
+
+; void sha256_ni_x1(SHA256_MB_ARGS_Xn *args, uint32_t size_in_blocks);
+; arg 0 : MGR : pointer to args (only the lane selected by IDX is used)
+; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
+; invisible arg 2 : IDX : which lane to hash
+; invisible arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it)
+; (sse/avx is 4, avx2 is 8, avx512 is 16)
+;
+; Clobbers registers: rax, r9~r11, xmm0-xmm10
+;
+mk_global sha256_ni_x1, function, internal
+sha256_ni_x1:
+ endbranch
+	shl	NBLK, 6		; convert the block count into bytes
+ jz backto_mgr
+
+ ; detach idx from nlanx4
+ mov IDX, NLANX4
+ shr NLANX4, 8
+ and IDX, 0xff
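+	; the caller packs the lane index into the low byte of NLANX4:
+	; IDX = low 8 bits, NLANX4 = remaining bits (max lanes * 4)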
+
+ lea TMP, [MGR + 4*IDX]
+ ;; Initialize digest
+ ;; digests -> ABEF(state0), CDGH(state1)
+ pinsrd STATE0, [TMP + 0*NLANX4], 3 ; A
+ pinsrd STATE0, [TMP + 1*NLANX4], 2 ; B
+ pinsrd STATE1, [TMP + 2*NLANX4], 3 ; C
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pinsrd STATE1, [TMP + 1*NLANX4], 2 ; D
+ pinsrd STATE0, [TMP + 2*NLANX4], 1 ; E
+ pinsrd STATE1, [TMP + 4*NLANX4], 1 ; G
+	lea	TMP, [TMP + 1*NLANX4]	; MGR + 4*IDX + 3*NLANX4
+ pinsrd STATE0, [TMP + 2*NLANX4], 0 ; F
+ pinsrd STATE1, [TMP + 4*NLANX4], 0 ; H
+
+ movdqa SHUF_MASK, [PSHUFFLE_SHANI_MASK]
+ lea TBL, [TABLE]
+
+ ;; Load input pointers
+ mov DPTR, [MGR + _data_ptr + IDX*8]
+ ;; nblk is used to indicate data end
+ add NBLK, DPTR
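+	; NBLK now points just past the last block; the loop runs until DPTR reaches it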
+
+lloop:
+ ; /* Save hash values for addition after rounds */
+ movdqa ABEF_SAVE, STATE0
+ movdqa CDGH_SAVE, STATE1
+
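+	; each sha256rnds2 performs two rounds, taking its W+K pair from the low
+	; 64 bits of xmm0 (MSG); the pshufd with 0x0E moves the upper two dwords
+	; down so the same register feeds the next two rounds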
+ ; /* Rounds 0-3 */
+ movdqu MSG, [DPTR + 0*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP0, MSG
+ paddd MSG, [TBL + 0*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ ; /* Rounds 4-7 */
+ movdqu MSG, [DPTR + 1*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP1, MSG
+ paddd MSG, [TBL + 1*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP0, MSGTMP1
+
+ ; /* Rounds 8-11 */
+ movdqu MSG, [DPTR + 2*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP2, MSG
+ paddd MSG, [TBL + 2*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP1, MSGTMP2
+
+ ; /* Rounds 12-15 */
+ movdqu MSG, [DPTR + 3*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP3, MSG
+ paddd MSG, [TBL + 3*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP3
+ palignr MSGTMP4, MSGTMP2, 4
+ paddd MSGTMP0, MSGTMP4
+ sha256msg2 MSGTMP0, MSGTMP3
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP2, MSGTMP3
+
+ ; /* Rounds 16-19 */
+ movdqa MSG, MSGTMP0
+ paddd MSG, [TBL + 4*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP0
+ palignr MSGTMP4, MSGTMP3, 4
+ paddd MSGTMP1, MSGTMP4
+ sha256msg2 MSGTMP1, MSGTMP0
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP3, MSGTMP0
+
+ ; /* Rounds 20-23 */
+ movdqa MSG, MSGTMP1
+ paddd MSG, [TBL + 5*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP1
+ palignr MSGTMP4, MSGTMP0, 4
+ paddd MSGTMP2, MSGTMP4
+ sha256msg2 MSGTMP2, MSGTMP1
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP0, MSGTMP1
+
+ ; /* Rounds 24-27 */
+ movdqa MSG, MSGTMP2
+ paddd MSG, [TBL + 6*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP2
+ palignr MSGTMP4, MSGTMP1, 4
+ paddd MSGTMP3, MSGTMP4
+ sha256msg2 MSGTMP3, MSGTMP2
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP1, MSGTMP2
+
+ ; /* Rounds 28-31 */
+ movdqa MSG, MSGTMP3
+ paddd MSG, [TBL + 7*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP3
+ palignr MSGTMP4, MSGTMP2, 4
+ paddd MSGTMP0, MSGTMP4
+ sha256msg2 MSGTMP0, MSGTMP3
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP2, MSGTMP3
+
+ ; /* Rounds 32-35 */
+ movdqa MSG, MSGTMP0
+ paddd MSG, [TBL + 8*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP0
+ palignr MSGTMP4, MSGTMP3, 4
+ paddd MSGTMP1, MSGTMP4
+ sha256msg2 MSGTMP1, MSGTMP0
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP3, MSGTMP0
+
+ ; /* Rounds 36-39 */
+ movdqa MSG, MSGTMP1
+ paddd MSG, [TBL + 9*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP1
+ palignr MSGTMP4, MSGTMP0, 4
+ paddd MSGTMP2, MSGTMP4
+ sha256msg2 MSGTMP2, MSGTMP1
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP0, MSGTMP1
+
+ ; /* Rounds 40-43 */
+ movdqa MSG, MSGTMP2
+ paddd MSG, [TBL + 10*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP2
+ palignr MSGTMP4, MSGTMP1, 4
+ paddd MSGTMP3, MSGTMP4
+ sha256msg2 MSGTMP3, MSGTMP2
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP1, MSGTMP2
+
+ ; /* Rounds 44-47 */
+ movdqa MSG, MSGTMP3
+ paddd MSG, [TBL + 11*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP3
+ palignr MSGTMP4, MSGTMP2, 4
+ paddd MSGTMP0, MSGTMP4
+ sha256msg2 MSGTMP0, MSGTMP3
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP2, MSGTMP3
+
+ ; /* Rounds 48-51 */
+ movdqa MSG, MSGTMP0
+ paddd MSG, [TBL + 12*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP0
+ palignr MSGTMP4, MSGTMP3, 4
+ paddd MSGTMP1, MSGTMP4
+ sha256msg2 MSGTMP1, MSGTMP0
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP3, MSGTMP0
+
+ ; /* Rounds 52-55 */
+ movdqa MSG, MSGTMP1
+ paddd MSG, [TBL + 13*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP1
+ palignr MSGTMP4, MSGTMP0, 4
+ paddd MSGTMP2, MSGTMP4
+ sha256msg2 MSGTMP2, MSGTMP1
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ ; /* Rounds 56-59 */
+ movdqa MSG, MSGTMP2
+ paddd MSG, [TBL + 14*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP2
+ palignr MSGTMP4, MSGTMP1, 4
+ paddd MSGTMP3, MSGTMP4
+ sha256msg2 MSGTMP3, MSGTMP2
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ ; /* Rounds 60-63 */
+ movdqa MSG, MSGTMP3
+ paddd MSG, [TBL + 15*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ ; /* Add current hash values with previously saved */
+ paddd STATE0, ABEF_SAVE
+ paddd STATE1, CDGH_SAVE
+
+ ; Increment data pointer and loop if more to process
+ add DPTR, 64
+ cmp DPTR, NBLK
+ jne lloop
+
+ ; write out digests
+ lea TMP, [MGR + 4*IDX]
+ ;; ABEF(state0), CDGH(state1) -> digests
+ pextrd [TMP + 0*NLANX4], STATE0, 3 ; A
+ pextrd [TMP + 1*NLANX4], STATE0, 2 ; B
+ pextrd [TMP + 2*NLANX4], STATE1, 3 ; C
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pextrd [TMP + 1*NLANX4], STATE1, 2 ; D
+ pextrd [TMP + 2*NLANX4], STATE0, 1 ; E
+ pextrd [TMP + 4*NLANX4], STATE1, 1 ; G
+	lea	TMP, [TMP + 1*NLANX4]	; MGR + 4*IDX + 3*NLANX4
+ pextrd [TMP + 2*NLANX4], STATE0, 0 ; F
+ pextrd [TMP + 4*NLANX4], STATE1, 0 ; H
+
+ ; update input pointers
+ mov [MGR + _data_ptr + IDX*8], DPTR
+
+backto_mgr:
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ ret
+
+
+section .data align=16
+PSHUFFLE_SHANI_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+TABLE: dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_ni_x1
+no_sha256_ni_x1:
+%endif
+%endif ; HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm
new file mode 100644
index 000000000..74cfc93b6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm
@@ -0,0 +1,574 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+%endif
+
+;; FRAMESZ plus pushes must be an odd multiple of 8
+%define FRAMESZ	64	; stack space for saving STATE0/STATE1 of both lanes (4 x 16 bytes)
+%define RSPSAVE rax
+
+%define MSG xmm0
+%define STATE0 xmm1
+%define STATE1 xmm2
+%define MSGTMP0 xmm3
+%define MSGTMP1 xmm4
+%define MSGTMP2 xmm5
+%define MSGTMP3 xmm6
+%define MSGTMP4 xmm7
+
+%define STATE0b xmm8
+%define STATE1b xmm9
+%define MSGTMP0b xmm10
+%define MSGTMP1b xmm11
+%define MSGTMP2b xmm12
+%define MSGTMP3b xmm13
+%define MSGTMP4b xmm14
+
+%define SHUF_MASK xmm15
+
+; arg indices start from 0 here, while mgr_flush/submit count them from 1
+%define MGR arg0
+%define NBLK arg1
+%define NLANX4 r10 ; consistent with caller
+%define IDX r8 ; local variable -- consistent with caller
+%define DPTR r11 ; local variable -- input buffer pointer
+%define DPTRb r12
+%define TMP r9 ; local variable -- assistant to address digest
+%define TBL r13
+%define TMPb r14 ; local variable -- assistant to address digest
+align 32
+
+; void sha256_ni_x2(SHA256_MB_ARGS_Xn *args, uint32_t size_in_blocks);
+; arg 0 : MGR : pointer to args (only lanes 0 and 1 are used)
+; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
+; invisible arg 2 : IDX : which lane to hash
+; invisible arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it)
+; (sse/avx is 4, avx2 is 8, avx512 is 16)
+;
+; Clobbers registers: rax, r9~r14, xmm0-xmm15
+;
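+; The two lanes are processed in lockstep: their sha256rnds2 sequences are
+; interleaved so the two independent dependency chains can overlap.
+;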
+mk_global sha256_ni_x2, function, internal
+sha256_ni_x2:
+ endbranch
+ mov RSPSAVE, rsp
+ sub rsp, FRAMESZ
+	and	rsp, ~0xF	; align rsp down to a 16-byte boundary
+
+	shl	NBLK, 6		; convert the block count into bytes
+ jz backto_mgr
+
+ ; detach idx from nlanx4
+ mov IDX, NLANX4
+ shr NLANX4, 8
+ and IDX, 0xff
+
+ lea TMP, [MGR + 4*0]
+ lea TMPb, [MGR + 4*1]
+
+ ;; Initialize digest
+ ;; digests -> ABEF(state0), CDGH(state1)
+ pinsrd STATE0, [TMP + 0*NLANX4], 3 ; A
+ pinsrd STATE0, [TMP + 1*NLANX4], 2 ; B
+ pinsrd STATE1, [TMP + 2*NLANX4], 3 ; C
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pinsrd STATE1, [TMP + 1*NLANX4], 2 ; D
+ pinsrd STATE0, [TMP + 2*NLANX4], 1 ; E
+ pinsrd STATE1, [TMP + 4*NLANX4], 1 ; G
+	lea	TMP, [TMP + 1*NLANX4]	; MGR + 4*IDX + 3*NLANX4
+ pinsrd STATE0, [TMP + 2*NLANX4], 0 ; F
+ pinsrd STATE1, [TMP + 4*NLANX4], 0 ; H
+
+ pinsrd STATE0b, [TMPb + 0*NLANX4], 3 ; A
+ pinsrd STATE0b, [TMPb + 1*NLANX4], 2 ; B
+ pinsrd STATE1b, [TMPb + 2*NLANX4], 3 ; C
+ lea TMPb, [TMPb + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pinsrd STATE1b, [TMPb + 1*NLANX4], 2 ; D
+ pinsrd STATE0b, [TMPb + 2*NLANX4], 1 ; E
+ pinsrd STATE1b, [TMPb + 4*NLANX4], 1 ; G
+	lea	TMPb, [TMPb + 1*NLANX4]	; MGR + 4*IDX + 3*NLANX4
+ pinsrd STATE0b, [TMPb + 2*NLANX4], 0 ; F
+ pinsrd STATE1b, [TMPb + 4*NLANX4], 0 ; H
+
+ movdqa SHUF_MASK, [PSHUFFLE_SHANI_MASK]
+ lea TBL, [TABLE]
+
+ ;; Load input pointers
+ mov DPTR, [MGR + _data_ptr + 8*0]
+ mov DPTRb,[MGR + _data_ptr + 8*1]
+ ;; nblk is used to indicate data end
+ add NBLK, DPTR
+
+lloop:
+ ; /* Save hash values for addition after rounds */
+ movdqa [rsp + 0*16], STATE0
+ movdqa [rsp + 1*16], STATE1
+
+ movdqa [rsp + 2*16], STATE0b
+ movdqa [rsp + 3*16], STATE1b
+
+ ; /* Rounds 0-3 */
+ movdqu MSG, [DPTR + 0*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP0, MSG
+ paddd MSG, [TBL + 0*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ movdqu MSG, [DPTRb + 0*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP0b, MSG
+ paddd MSG, [TBL + 0*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+
+ ; /* Rounds 4-7 */
+ movdqu MSG, [DPTR + 1*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP1, MSG
+ paddd MSG, [TBL + 1*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP0, MSGTMP1
+
+ movdqu MSG, [DPTRb + 1*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP1b, MSG
+ paddd MSG, [TBL + 1*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP0b, MSGTMP1b
+
+ ; /* Rounds 8-11 */
+ movdqu MSG, [DPTR + 2*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP2, MSG
+ paddd MSG, [TBL + 2*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP1, MSGTMP2
+
+ movdqu MSG, [DPTRb + 2*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP2b, MSG
+ paddd MSG, [TBL + 2*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP1b, MSGTMP2b
+
+ ; /* Rounds 12-15 */
+ movdqu MSG, [DPTR + 3*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP3, MSG
+ paddd MSG, [TBL + 3*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP3
+ palignr MSGTMP4, MSGTMP2, 4
+ paddd MSGTMP0, MSGTMP4
+ sha256msg2 MSGTMP0, MSGTMP3
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP2, MSGTMP3
+
+ movdqu MSG, [DPTRb + 3*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP3b, MSG
+ paddd MSG, [TBL + 3*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP3b
+ palignr MSGTMP4b, MSGTMP2b, 4
+ paddd MSGTMP0b, MSGTMP4b
+ sha256msg2 MSGTMP0b, MSGTMP3b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP2b, MSGTMP3b
+
+ ; /* Rounds 16-19 */
+ movdqa MSG, MSGTMP0
+ paddd MSG, [TBL + 4*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP0
+ palignr MSGTMP4, MSGTMP3, 4
+ paddd MSGTMP1, MSGTMP4
+ sha256msg2 MSGTMP1, MSGTMP0
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP3, MSGTMP0
+
+ movdqa MSG, MSGTMP0b
+ paddd MSG, [TBL + 4*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP0b
+ palignr MSGTMP4b, MSGTMP3b, 4
+ paddd MSGTMP1b, MSGTMP4b
+ sha256msg2 MSGTMP1b, MSGTMP0b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP3b, MSGTMP0b
+
+ ; /* Rounds 20-23 */
+ movdqa MSG, MSGTMP1
+ paddd MSG, [TBL + 5*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP1
+ palignr MSGTMP4, MSGTMP0, 4
+ paddd MSGTMP2, MSGTMP4
+ sha256msg2 MSGTMP2, MSGTMP1
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP0, MSGTMP1
+
+ movdqa MSG, MSGTMP1b
+ paddd MSG, [TBL + 5*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP1b
+ palignr MSGTMP4b, MSGTMP0b, 4
+ paddd MSGTMP2b, MSGTMP4b
+ sha256msg2 MSGTMP2b, MSGTMP1b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP0b, MSGTMP1b
+
+ ; /* Rounds 24-27 */
+ movdqa MSG, MSGTMP2
+ paddd MSG, [TBL + 6*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP2
+ palignr MSGTMP4, MSGTMP1, 4
+ paddd MSGTMP3, MSGTMP4
+ sha256msg2 MSGTMP3, MSGTMP2
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP1, MSGTMP2
+
+ movdqa MSG, MSGTMP2b
+ paddd MSG, [TBL + 6*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP2b
+ palignr MSGTMP4b, MSGTMP1b, 4
+ paddd MSGTMP3b, MSGTMP4b
+ sha256msg2 MSGTMP3b, MSGTMP2b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP1b, MSGTMP2b
+
+ ; /* Rounds 28-31 */
+ movdqa MSG, MSGTMP3
+ paddd MSG, [TBL + 7*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP3
+ palignr MSGTMP4, MSGTMP2, 4
+ paddd MSGTMP0, MSGTMP4
+ sha256msg2 MSGTMP0, MSGTMP3
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP2, MSGTMP3
+
+ movdqa MSG, MSGTMP3b
+ paddd MSG, [TBL + 7*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP3b
+ palignr MSGTMP4b, MSGTMP2b, 4
+ paddd MSGTMP0b, MSGTMP4b
+ sha256msg2 MSGTMP0b, MSGTMP3b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP2b, MSGTMP3b
+
+ ; /* Rounds 32-35 */
+ movdqa MSG, MSGTMP0
+ paddd MSG, [TBL + 8*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP0
+ palignr MSGTMP4, MSGTMP3, 4
+ paddd MSGTMP1, MSGTMP4
+ sha256msg2 MSGTMP1, MSGTMP0
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP3, MSGTMP0
+
+ movdqa MSG, MSGTMP0b
+ paddd MSG, [TBL + 8*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP0b
+ palignr MSGTMP4b, MSGTMP3b, 4
+ paddd MSGTMP1b, MSGTMP4b
+ sha256msg2 MSGTMP1b, MSGTMP0b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP3b, MSGTMP0b
+
+ ; /* Rounds 36-39 */
+ movdqa MSG, MSGTMP1
+ paddd MSG, [TBL + 9*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP1
+ palignr MSGTMP4, MSGTMP0, 4
+ paddd MSGTMP2, MSGTMP4
+ sha256msg2 MSGTMP2, MSGTMP1
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP0, MSGTMP1
+
+ movdqa MSG, MSGTMP1b
+ paddd MSG, [TBL + 9*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP1b
+ palignr MSGTMP4b, MSGTMP0b, 4
+ paddd MSGTMP2b, MSGTMP4b
+ sha256msg2 MSGTMP2b, MSGTMP1b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP0b, MSGTMP1b
+
+ ; /* Rounds 40-43 */
+ movdqa MSG, MSGTMP2
+ paddd MSG, [TBL + 10*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP2
+ palignr MSGTMP4, MSGTMP1, 4
+ paddd MSGTMP3, MSGTMP4
+ sha256msg2 MSGTMP3, MSGTMP2
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP1, MSGTMP2
+
+ movdqa MSG, MSGTMP2b
+ paddd MSG, [TBL + 10*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP2b
+ palignr MSGTMP4b, MSGTMP1b, 4
+ paddd MSGTMP3b, MSGTMP4b
+ sha256msg2 MSGTMP3b, MSGTMP2b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP1b, MSGTMP2b
+
+ ; /* Rounds 44-47 */
+ movdqa MSG, MSGTMP3
+ paddd MSG, [TBL + 11*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP3
+ palignr MSGTMP4, MSGTMP2, 4
+ paddd MSGTMP0, MSGTMP4
+ sha256msg2 MSGTMP0, MSGTMP3
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP2, MSGTMP3
+
+ movdqa MSG, MSGTMP3b
+ paddd MSG, [TBL + 11*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP3b
+ palignr MSGTMP4b, MSGTMP2b, 4
+ paddd MSGTMP0b, MSGTMP4b
+ sha256msg2 MSGTMP0b, MSGTMP3b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP2b, MSGTMP3b
+
+ ; /* Rounds 48-51 */
+ movdqa MSG, MSGTMP0
+ paddd MSG, [TBL + 12*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP0
+ palignr MSGTMP4, MSGTMP3, 4
+ paddd MSGTMP1, MSGTMP4
+ sha256msg2 MSGTMP1, MSGTMP0
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP3, MSGTMP0
+
+ movdqa MSG, MSGTMP0b
+ paddd MSG, [TBL + 12*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP0b
+ palignr MSGTMP4b, MSGTMP3b, 4
+ paddd MSGTMP1b, MSGTMP4b
+ sha256msg2 MSGTMP1b, MSGTMP0b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP3b, MSGTMP0b
+
+ ; /* Rounds 52-55 */
+ movdqa MSG, MSGTMP1
+ paddd MSG, [TBL + 13*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP1
+ palignr MSGTMP4, MSGTMP0, 4
+ paddd MSGTMP2, MSGTMP4
+ sha256msg2 MSGTMP2, MSGTMP1
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ movdqa MSG, MSGTMP1b
+ paddd MSG, [TBL + 13*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP1b
+ palignr MSGTMP4b, MSGTMP0b, 4
+ paddd MSGTMP2b, MSGTMP4b
+ sha256msg2 MSGTMP2b, MSGTMP1b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+
+ ; /* Rounds 56-59 */
+ movdqa MSG, MSGTMP2
+ paddd MSG, [TBL + 14*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP2
+ palignr MSGTMP4, MSGTMP1, 4
+ paddd MSGTMP3, MSGTMP4
+ sha256msg2 MSGTMP3, MSGTMP2
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ movdqa MSG, MSGTMP2b
+ paddd MSG, [TBL + 14*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP2b
+ palignr MSGTMP4b, MSGTMP1b, 4
+ paddd MSGTMP3b, MSGTMP4b
+ sha256msg2 MSGTMP3b, MSGTMP2b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+
+ ; /* Rounds 60-63 */
+ movdqa MSG, MSGTMP3
+ paddd MSG, [TBL + 15*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ movdqa MSG, MSGTMP3b
+ paddd MSG, [TBL + 15*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+
+ ; /* Add current hash values with previously saved */
+ paddd STATE0, [rsp + 0*16]
+ paddd STATE1, [rsp + 1*16]
+
+ paddd STATE0b, [rsp + 2*16]
+ paddd STATE1b, [rsp + 3*16]
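+	; (Merkle-Damgard feed-forward: H_i = compress(H_{i-1}, M_i) + H_{i-1})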
+
+ ; Increment data pointer and loop if more to process
+ add DPTR, 64
+ add DPTRb, 64
+ cmp DPTR, NBLK
+ jne lloop
+
+ ; write out digests
+ lea TMP, [MGR + 4*0]
+ ;; ABEF(state0), CDGH(state1) -> digests
+ pextrd [TMP + 0*NLANX4], STATE0, 3 ; A
+ pextrd [TMP + 1*NLANX4], STATE0, 2 ; B
+ pextrd [TMP + 2*NLANX4], STATE1, 3 ; C
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pextrd [TMP + 1*NLANX4], STATE1, 2 ; D
+ pextrd [TMP + 2*NLANX4], STATE0, 1 ; E
+ pextrd [TMP + 4*NLANX4], STATE1, 1 ; G
+	lea	TMP, [TMP + 1*NLANX4]	; MGR + 4*IDX + 3*NLANX4
+ pextrd [TMP + 2*NLANX4], STATE0, 0 ; F
+ pextrd [TMP + 4*NLANX4], STATE1, 0 ; H
+
+ lea TMPb, [MGR + 4*1]
+ ;; ABEF(state0), CDGH(state1) -> digests
+ pextrd [TMPb + 0*NLANX4], STATE0b, 3 ; A
+ pextrd [TMPb + 1*NLANX4], STATE0b, 2 ; B
+ pextrd [TMPb + 2*NLANX4], STATE1b, 3 ; C
+ lea TMPb, [TMPb + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pextrd [TMPb + 1*NLANX4], STATE1b, 2 ; D
+ pextrd [TMPb + 2*NLANX4], STATE0b, 1 ; E
+ pextrd [TMPb + 4*NLANX4], STATE1b, 1 ; G
+	lea	TMPb, [TMPb + 1*NLANX4]	; MGR + 4*IDX + 3*NLANX4
+ pextrd [TMPb + 2*NLANX4], STATE0b, 0 ; F
+ pextrd [TMPb + 4*NLANX4], STATE1b, 0 ; H
+
+ ; update input pointers
+ mov [MGR + _data_ptr + 0*8], DPTR
+ mov [MGR + _data_ptr + 1*8], DPTRb
+
+backto_mgr:
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+ mov rsp, RSPSAVE
+
+ ret
+
+section .data align=16
+PSHUFFLE_SHANI_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
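+; SHA-256 round constants K[0..63] (FIPS 180-4), one 16-byte group per 4 rounds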
+TABLE: dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_ni_x2
+no_sha256_ni_x2:
+%endif
+%endif ; HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_opt_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_opt_x1.asm
new file mode 100644
index 000000000..fc13ec279
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_opt_x1.asm
@@ -0,0 +1,567 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Implement fast SHA-256 with SSSE3 instructions. (x86_64)
+;
+; Copyright (C) 2013 Intel Corporation.
+;
+; Authors:
+; James Guilford <james.guilford@intel.com>
+; Kirk Yap <kirk.s.yap@intel.com>
+; Tim Chen <tim.c.chen@linux.intel.com>
+; Transcoded by:
+; Xiaodong Liu <xiaodong.liu@intel.com>
+;
+; This software is available to you under the OpenIB.org BSD license
+; below:
+;
+; Redistribution and use in source and binary forms, with or
+; without modification, are permitted provided that the following
+; conditions are met:
+;
+; - Redistributions of source code must retain the above
+; copyright notice, this list of conditions and the following
+; disclaimer.
+;
+; - Redistributions in binary form must reproduce the above
+; copyright notice, this list of conditions and the following
+; disclaimer in the documentation and/or other materials
+; provided with the distribution.
+;
+; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+; BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+; ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+; CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+; SOFTWARE.
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; This code is described in an Intel White-Paper:
+; "Fast SHA-256 Implementations on Intel Architecture Processors"
+;
+; To find it, surf to http://www.intel.com/p/en_US/embedded
+; and search for that title.
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+%endif
+
+%xdefine X0 xmm4
+%xdefine X1 xmm5
+%xdefine X2 xmm6
+%xdefine X3 xmm7
+
+%xdefine XTMP0 xmm0
+%xdefine XTMP1 xmm1
+%xdefine XTMP2 xmm2
+%xdefine XTMP3 xmm3
+%xdefine XTMP4 xmm8
+%xdefine XFER xmm9
+
+%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
+%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00
+%define BYTE_FLIP_MASK xmm12
+
+; arg indexing starts from 0, while mgr_flush/submit numbers args from 1
+%define MGR arg0 ; rdi or rcx
+%define NBLK arg1 ; rsi or rdx
+%define IDX r8 ; local variable -- consistent with caller
+%define NLANX4 r10 ; consistent with caller, should be r10
+
+%define TMGR	r9	; manager pointer, kept in the stack slot _TMGR
+%define INP r9 ; data pointer stored in stack named _INP
+%define SRND r9 ; clobbers INP
+%define TMP r9 ; local variable -- assistant to address digest
+
+%xdefine TBL rbp
+%xdefine c ecx
+%xdefine d esi
+%xdefine e edx
+%xdefine a eax
+%xdefine b ebx
+
+%xdefine f edi
+%xdefine g r12d
+%xdefine h r11d
+
+%xdefine y0 r13d
+%xdefine y1 r14d
+%xdefine y2 r15d
+
+
+;; STACK_SIZE must be an odd multiple of 8 so that [rsp + _XFER] is 16-byte aligned
+%define _STACK_ALIGN_SIZE 8 ; 0 or 8 depends on pushes
+%define _INP_END_SIZE 8
+%define _INP_SIZE 8
+%define _TMGR_SIZE 8
+%define _XFER_SIZE 16
+%define _XMM_SAVE_SIZE 0
+%define _GPR_SAVE_SIZE 8*9 ;rbx, rdx, rbp, (rdi, rsi), r12~r15
+
+%define _STACK_ALIGN 0
+%define _INP_END (_STACK_ALIGN + _STACK_ALIGN_SIZE)
+%define _INP (_INP_END + _INP_END_SIZE)
+%define _TMGR (_INP + _INP_SIZE)
+%define _XFER (_TMGR + _TMGR_SIZE)
+%define _XMM_SAVE (_XFER + _XFER_SIZE)
+%define _GPR_SAVE (_XMM_SAVE + _XMM_SAVE_SIZE)
+%define STACK_SIZE (_GPR_SAVE + _GPR_SAVE_SIZE)
+
+;; assume buffers not aligned
+%define MOVDQ movdqu
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
+
+; addm [mem], reg
+; Add reg to mem using reg-mem add and store
+%macro addm 2
+ add %2, %1 ;changed
+ mov %1, %2 ;changed
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+; Load xmm with mem and byte swap each dword
+%macro COPY_XMM_AND_BSWAP 3
+ MOVDQ %1, %2 ;changed
+ pshufb %1, %3 ;changed
+%endmacro
+
+; rotate_Xs
+; Rotate values of symbols X0...X3
+%macro rotate_Xs 0
+%xdefine X_ X0
+%xdefine X0 X1
+%xdefine X1 X2
+%xdefine X2 X3
+%xdefine X3 X_
+%endmacro
+
+; ROTATE_ARGS
+; Rotate values of symbols a...h
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endmacro
+
+%macro FOUR_ROUNDS_AND_SCHED 0
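+	;; Does 4 SHA-256 rounds while scheduling the next 4 message words
+	;; W[i] = s1(W[i-2]) + W[i-7] + s0(W[i-15]) + W[i-16]; the vector schedule
+	;; work is interleaved with the scalar round logic to hide latency.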
+ ;; compute s0 four at a time and s1 two at a time
+ ;; compute W[-16] + W[-7] 4 at a time
+ movdqa XTMP0, X3
+ mov y0, e ; y0 = e
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ palignr XTMP0, X2, 4 ; XTMP0 = W[-7]
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ movdqa XTMP1, X1
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ xor y2, g ; y2 = f^g
+ paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ ;; compute s0
+ palignr XTMP1, X0, 4 ; XTMP1 = W[-15]
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ movdqa XTMP2, XTMP1 ; XTMP2 = W[-15]
+ ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, y0 ; y2 = S1 + CH
+ add y2 , [rsp + _XFER] ; y2 = k + w + S1 + CH
+ movdqa XTMP3, XTMP1 ; XTMP3 = W[-15]
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ pslld XTMP1, (32-7) ;
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ psrld XTMP2, 7 ;
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ ROTATE_ARGS
+ movdqa XTMP2, XTMP3 ; XTMP2 = W[-15]
+ mov y0, e ; y0 = e
+ mov y1, a ; y1 = a
+ movdqa XTMP4, XTMP3 ; XTMP4 = W[-15]
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ pslld XTMP3, (32-18) ;
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor y2, g ; y2 = f^g
+ psrld XTMP2, 18 ;
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ pxor XTMP1, XTMP3
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + (1*4 + _XFER)] ; y2 = k + w + S1 + CH
+ ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ pxor XTMP1, XTMP4 ; XTMP1 = s0
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ ;; compute low s1
+ pshufd XTMP2, X3, 11111010B ; XTMP2 = W[-2] {BBAA}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ ROTATE_ARGS
+ movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
+ mov y0, e ; y0 = e
+ mov y1, a ; y1 = a
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ mov y2, f ; y2 = f
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
+ xor y2, g ; y2 = f^g
+ psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ pxor XTMP2, XTMP3
+ add y2, y0 ; y2 = S1 + CH
+ ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, [rsp + (2*4 + _XFER)] ; y2 = k + w + S1 + CH
+ pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ ;; compute high s1
+ pshufd XTMP2, XTMP0, 01010000B ; XTMP2 = W[-2] {BBAA}
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ ROTATE_ARGS
+ movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
+ mov y0, e ; y0 = e
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ movdqa X0, XTMP2 ; X0 = W[-2] {DDCC}
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ xor y2, g ; y2 = f^g
+ psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
+	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC}
+	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ pxor XTMP2, XTMP3 ;
+	ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + (3*4 + _XFER)] ; y2 = k + w + S1 + CH
+ pxor X0, XTMP2 ; X0 = s1 {xDxC}
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ pshufb X0, SHUF_DC00 ; X0 = s1 {DC00}
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ ROTATE_ARGS
+ rotate_Xs
+%endmacro
+
+;; input is [rsp + _XFER + %1 * 4]
+%macro DO_ROUND 1
+ mov y0, e ; y0 = e
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ mov y2, f ; y2 = f
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor y2, g ; y2 = f^g
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ and y2, e ; y2 = (f^g)&e
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ add y2, y0 ; y2 = S1 + CH
+ ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ %xdefine offset (%1 * 4 + _XFER)
+ add y2, [rsp + offset] ; y2 = k + w + S1 + CH
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; void sha256_opt_x1(SHA256_MB_ARGS_Xn *args, uint32_t size_in_blocks);
+; arg 0 : MGR : pointer to args (only the lane selected by IDX is used)
+; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
+; invisible arg 2 : IDX : hash on which lane
+; invisible arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it)
+; (sse/avx is 4, avx2 is 8, avx512 is 16)
+;
+; Clobbers registers: all general regs, xmm0-xmm12
+; {rbx, rdx, rbp, (rdi, rsi), r12~r15 are saved on stack}
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+section .text
+mk_global sha256_opt_x1, function, internal
+sha256_opt_x1:
+ endbranch
+ sub rsp, STACK_SIZE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], rbp
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ mov [rsp + _GPR_SAVE + 8*3], rsi
+ ; caller has already stored XMM6~10
+%endif
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+ mov [rsp + _GPR_SAVE + 8*8], rdx
+
+ shl NBLK, 6 ; convert to bytes
+ jz done_hash
+
+ ; detach idx from nlanx4
+ mov IDX, NLANX4
+ shr NLANX4, 8
+ and IDX, 0xff
+
+ mov [rsp + _TMGR], MGR
+ ;; Load input pointers
+ mov INP, [MGR + _data_ptr + IDX*8]
+ mov [rsp + _INP], INP
+ ;; nblk is used to indicate data end
+ add NBLK, INP
+ mov [rsp + _INP_END], NBLK ; pointer to end of data
+
+
+ mov TMGR, [rsp + _TMGR]
+ ;; load initial digest
+ lea TMP, [TMGR + 4*IDX]
+ mov a, [TMP + 0*NLANX4]
+ mov b, [TMP + 1*NLANX4]
+ mov c, [TMP + 2*NLANX4]
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ mov d, [TMP + 1*NLANX4]
+ mov e, [TMP + 2*NLANX4]
+ mov g, [TMP + 4*NLANX4]
+ lea TMP, [TMP + 1*NLANX4] ; MGR + 4*IDX + 3*NLANX4
+ mov f, [TMP + 2*NLANX4]
+ mov h, [TMP + 4*NLANX4]
+
+ movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK]
+ movdqa SHUF_00BA, [_SHUF_00BA]
+ movdqa SHUF_DC00, [_SHUF_DC00]
+
+ mov INP, [rsp + _INP]
+loop0:
+ lea TBL, [K256]
+
+ ;; byte swap first 16 dwords
+ COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
+
+ mov [rsp + _INP], INP
+
+ ;; schedule 48 input dwords, by doing 3 rounds of 16 each
+ mov SRND, 3
+
+loop1:
+ movdqa XFER, [TBL]
+ paddd XFER, X0
+ movdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa XFER, [TBL + 1*16]
+ paddd XFER, X0
+ movdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa XFER, [TBL + 2*16]
+ paddd XFER, X0
+ movdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa XFER, [TBL + 3*16]
+ paddd XFER, X0
+ movdqa [rsp + _XFER], XFER
+ add TBL, 4*16
+ FOUR_ROUNDS_AND_SCHED
+
+ sub SRND, 1
+ jne loop1
+
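+	;; final 16 rounds (48..63): the remaining message words are already in
+	;; X0..X3, so only the round computation is left (2 iterations of 8 rounds)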
+ mov SRND, 2
+loop2:
+ paddd X0, [TBL]
+ movdqa [rsp + _XFER], X0
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+ paddd X1, [TBL + 1*16]
+ movdqa [rsp + _XFER], X1
+ add TBL, 2*16
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ movdqa X0, X2
+ movdqa X1, X3
+
+ sub SRND, 1
+ jne loop2
+
+ ; write out digests
+ mov TMGR, [rsp + _TMGR]
+ lea TMP, [TMGR + 4*IDX]
+ addm a, [TMP + 0*NLANX4]
+ addm b, [TMP + 1*NLANX4]
+ addm c, [TMP + 2*NLANX4]
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ addm d, [TMP + 1*NLANX4]
+ addm e, [TMP + 2*NLANX4]
+ addm g, [TMP + 4*NLANX4]
+ lea TMP, [TMP + 1*NLANX4] ; MGR + 4*IDX + 3*NLANX4
+ addm f, [TMP + 2*NLANX4]
+ addm h, [TMP + 4*NLANX4]
+
+ mov INP, [rsp + _INP]
+ add INP, 64
+ cmp INP, [rsp + _INP_END]
+ jne loop0
+
+done_hash:
+ mov MGR, [rsp + _TMGR]
+
+ mov rdx, [rsp + _GPR_SAVE + 8*8]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rsi, [rsp + _GPR_SAVE + 8*3]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbp, [rsp + _GPR_SAVE + 8*1]
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ add rsp, STACK_SIZE
+
+ ret
+
+section .data
+align 64
+K256:
+ DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ DD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ DD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ DD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK:
+ DQ 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+; shuffle xBxA -> 00BA
+_SHUF_00BA:
+ DQ 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF
+
+; shuffle xDxC -> DC00
+_SHUF_DC00:
+ DQ 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c
new file mode 100644
index 000000000..c3515dc52
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c
@@ -0,0 +1,204 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sha256_mb.h"
+#include "endian_helper.h"
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference SHA256 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+#define H0 0x6a09e667
+#define H1 0xbb67ae85
+#define H2 0x3c6ef372
+#define H3 0xa54ff53a
+#define H4 0x510e527f
+#define H5 0x9b05688c
+#define H6 0x1f83d9ab
+#define H7 0x5be0cd19
+
+#define ror32(x, r) (((x)>>(r)) ^ ((x)<<(32-(r))))
+
+#define W(x) w[(x) & 15]
+
+#define S0(w) (ror32(w,7) ^ ror32(w,18) ^ (w >> 3))
+#define S1(w) (ror32(w,17) ^ ror32(w,19) ^ (w >> 10))
+
+#define s0(a) (ror32(a,2) ^ ror32(a,13) ^ ror32(a,22))
+#define s1(e) (ror32(e,6) ^ ror32(e,11) ^ ror32(e,25))
+#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c))
+#define ch(e,f,g) ((e & f) ^ (g & ~e))
+
+#define step(i,a,b,c,d,e,f,g,h,k) \
+ if (i<16) W(i) = to_be32(ww[i]); \
+ else \
+ W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \
+ t2 = s0(a) + maj(a,b,c); \
+ t1 = h + s1(e) + ch(e,f,g) + k + W(i); \
+ d += t1; \
+ h = t1 + t2;
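+/* Note: instead of physically rotating the working variables a..h after each
+ * round, the callers below permute the arguments passed to step(), so each
+ * expansion updates the proper d and h for that round. */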
+
+static void OPT_FIX sha256_single(const uint8_t * data, uint32_t digest[]);
+
+void sha256_ref(const uint8_t * input_data, uint32_t * digest, const uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA256_BLOCK_SIZE];
+
+ digest[0] = H0;
+ digest[1] = H1;
+ digest[2] = H2;
+ digest[3] = H3;
+ digest[4] = H4;
+ digest[5] = H5;
+ digest[6] = H6;
+ digest[7] = H7;
+
+ i = len;
+ while (i >= SHA256_BLOCK_SIZE) {
+ sha256_single(input_data, digest);
+ input_data += SHA256_BLOCK_SIZE;
+ i -= SHA256_BLOCK_SIZE;
+ }
+
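+	// Pad the tail: append the 0x80 marker, zero-fill, then store the 64-bit
+	// big-endian bit length at the end; one or two final blocks are hashed
+	// depending on how many bytes remain.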
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA256_BLOCK_SIZE) - SHA256_PADLENGTHFIELD_SIZE); j++)
+ buf[j] = 0;
+
+ if (i > SHA256_BLOCK_SIZE - SHA256_PADLENGTHFIELD_SIZE)
+ i = 2 * SHA256_BLOCK_SIZE;
+ else
+ i = SHA256_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8);
+
+ sha256_single(buf, digest);
+ if (i == 2 * SHA256_BLOCK_SIZE)
+ sha256_single(buf + SHA256_BLOCK_SIZE, digest);
+}
+
+void sha256_single(const uint8_t * data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e, f, g, h, t1, t2;
+ uint32_t w[16];
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+ f = digest[5];
+ g = digest[6];
+ h = digest[7];
+
+ step(0, a, b, c, d, e, f, g, h, 0x428a2f98);
+ step(1, h, a, b, c, d, e, f, g, 0x71374491);
+ step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcf);
+ step(3, f, g, h, a, b, c, d, e, 0xe9b5dba5);
+ step(4, e, f, g, h, a, b, c, d, 0x3956c25b);
+ step(5, d, e, f, g, h, a, b, c, 0x59f111f1);
+ step(6, c, d, e, f, g, h, a, b, 0x923f82a4);
+ step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5);
+ step(8, a, b, c, d, e, f, g, h, 0xd807aa98);
+ step(9, h, a, b, c, d, e, f, g, 0x12835b01);
+ step(10, g, h, a, b, c, d, e, f, 0x243185be);
+ step(11, f, g, h, a, b, c, d, e, 0x550c7dc3);
+ step(12, e, f, g, h, a, b, c, d, 0x72be5d74);
+ step(13, d, e, f, g, h, a, b, c, 0x80deb1fe);
+ step(14, c, d, e, f, g, h, a, b, 0x9bdc06a7);
+ step(15, b, c, d, e, f, g, h, a, 0xc19bf174);
+ step(16, a, b, c, d, e, f, g, h, 0xe49b69c1);
+ step(17, h, a, b, c, d, e, f, g, 0xefbe4786);
+ step(18, g, h, a, b, c, d, e, f, 0x0fc19dc6);
+ step(19, f, g, h, a, b, c, d, e, 0x240ca1cc);
+ step(20, e, f, g, h, a, b, c, d, 0x2de92c6f);
+ step(21, d, e, f, g, h, a, b, c, 0x4a7484aa);
+ step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc);
+ step(23, b, c, d, e, f, g, h, a, 0x76f988da);
+ step(24, a, b, c, d, e, f, g, h, 0x983e5152);
+ step(25, h, a, b, c, d, e, f, g, 0xa831c66d);
+ step(26, g, h, a, b, c, d, e, f, 0xb00327c8);
+ step(27, f, g, h, a, b, c, d, e, 0xbf597fc7);
+ step(28, e, f, g, h, a, b, c, d, 0xc6e00bf3);
+ step(29, d, e, f, g, h, a, b, c, 0xd5a79147);
+ step(30, c, d, e, f, g, h, a, b, 0x06ca6351);
+ step(31, b, c, d, e, f, g, h, a, 0x14292967);
+ step(32, a, b, c, d, e, f, g, h, 0x27b70a85);
+ step(33, h, a, b, c, d, e, f, g, 0x2e1b2138);
+ step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc);
+ step(35, f, g, h, a, b, c, d, e, 0x53380d13);
+ step(36, e, f, g, h, a, b, c, d, 0x650a7354);
+ step(37, d, e, f, g, h, a, b, c, 0x766a0abb);
+ step(38, c, d, e, f, g, h, a, b, 0x81c2c92e);
+ step(39, b, c, d, e, f, g, h, a, 0x92722c85);
+ step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1);
+ step(41, h, a, b, c, d, e, f, g, 0xa81a664b);
+ step(42, g, h, a, b, c, d, e, f, 0xc24b8b70);
+ step(43, f, g, h, a, b, c, d, e, 0xc76c51a3);
+ step(44, e, f, g, h, a, b, c, d, 0xd192e819);
+ step(45, d, e, f, g, h, a, b, c, 0xd6990624);
+ step(46, c, d, e, f, g, h, a, b, 0xf40e3585);
+ step(47, b, c, d, e, f, g, h, a, 0x106aa070);
+ step(48, a, b, c, d, e, f, g, h, 0x19a4c116);
+ step(49, h, a, b, c, d, e, f, g, 0x1e376c08);
+ step(50, g, h, a, b, c, d, e, f, 0x2748774c);
+ step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5);
+ step(52, e, f, g, h, a, b, c, d, 0x391c0cb3);
+ step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a);
+ step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f);
+ step(55, b, c, d, e, f, g, h, a, 0x682e6ff3);
+ step(56, a, b, c, d, e, f, g, h, 0x748f82ee);
+ step(57, h, a, b, c, d, e, f, g, 0x78a5636f);
+ step(58, g, h, a, b, c, d, e, f, 0x84c87814);
+ step(59, f, g, h, a, b, c, d, e, 0x8cc70208);
+ step(60, e, f, g, h, a, b, c, d, 0x90befffa);
+ step(61, d, e, f, g, h, a, b, c, 0xa4506ceb);
+ step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7);
+ step(63, b, c, d, e, f, g, h, a, 0xc67178f2);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+ digest[5] += f;
+ digest[6] += g;
+ digest[7] += h;
+}
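+
+/*
+ * Illustrative usage sketch (assumes SHA256_DIGEST_NWORDS from sha256_mb.h;
+ * 'msg' and 'msg_len' are hypothetical caller-side names):
+ *
+ *   uint32_t digest[SHA256_DIGEST_NWORDS];
+ *   sha256_ref(msg, digest, msg_len);
+ *   // digest[0..7] now holds the eight 32-bit SHA-256 hash words
+ */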
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am
new file mode 100644
index 000000000..4ba7d1049
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am
@@ -0,0 +1,108 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_x86_64 += sha512_mb/sha512_ctx_sse.c \
+ sha512_mb/sha512_ctx_avx.c \
+ sha512_mb/sha512_ctx_avx2.c \
+ sha512_mb/sha512_ctx_sb_sse4.c \
+ sha512_mb/sha512_ctx_base.c
+
+lsrc_x86_64 += sha512_mb/sha512_mb_mgr_init_sse.c \
+ sha512_mb/sha512_mb_mgr_init_avx2.c \
+ sha512_mb/sha512_sb_mgr_init_sse4.c
+
+lsrc_x86_32 += $(lsrc_x86_64)
+
+lsrc_x86_64 += sha512_mb/sha512_mb_mgr_submit_sse.asm \
+ sha512_mb/sha512_mb_mgr_submit_avx.asm \
+ sha512_mb/sha512_mb_mgr_submit_avx2.asm \
+ sha512_mb/sha512_mb_mgr_flush_sse.asm \
+ sha512_mb/sha512_mb_mgr_flush_avx.asm \
+ sha512_mb/sha512_mb_mgr_flush_avx2.asm \
+ sha512_mb/sha512_mb_x2_sse.asm \
+ sha512_mb/sha512_mb_x2_avx.asm \
+ sha512_mb/sha512_mb_x4_avx2.asm \
+ sha512_mb/sha512_multibinary.asm \
+ sha512_mb/sha512_sb_mgr_submit_sse4.c \
+ sha512_mb/sha512_sb_mgr_flush_sse4.c \
+ sha512_mb/sha512_sse4.asm
+
+lsrc_x86_64 += sha512_mb/sha512_ctx_avx512.c \
+ sha512_mb/sha512_mb_mgr_init_avx512.c \
+ sha512_mb/sha512_mb_mgr_submit_avx512.asm \
+ sha512_mb/sha512_mb_mgr_flush_avx512.asm \
+ sha512_mb/sha512_mb_x8_avx512.asm
+
+lsrc_x86_32 += $(lsrc_x86_64)
+
+lsrc_aarch64 += sha512_mb/sha512_ctx_base.c \
+ sha512_mb/aarch64/sha512_mb_multibinary.S \
+ sha512_mb/aarch64/sha512_mb_aarch64_dispatcher.c \
+ sha512_mb/aarch64/sha512_ctx_ce.c \
+ sha512_mb/aarch64/sha512_mb_mgr_ce.c \
+ sha512_mb/aarch64/sha512_mb_x1_ce.S \
+ sha512_mb/aarch64/sha512_mb_x2_ce.S
+
+lsrc_base_aliases += sha512_mb/sha512_ctx_base.c \
+ sha512_mb/sha512_ctx_base_aliases.c
+
+src_include += -I $(srcdir)/sha512_mb
+
+extern_hdrs += include/sha512_mb.h \
+ include/multi_buffer.h
+
+other_src += include/datastruct.asm \
+ sha512_mb/sha512_job.asm \
+ sha512_mb/sha512_mb_mgr_datastruct.asm \
+ include/reg_sizes.asm \
+ sha512_mb/sha512_ref.c \
+ include/memcpy_inline.h \
+ include/memcpy.asm \
+ include/intrinreg.h
+
+check_tests += sha512_mb/sha512_mb_test \
+ sha512_mb/sha512_mb_rand_test \
+ sha512_mb/sha512_mb_rand_update_test
+
+unit_tests += sha512_mb/sha512_mb_rand_ssl_test
+
+perf_tests += sha512_mb/sha512_mb_vs_ossl_perf
+
+sha512_mb_rand_test: sha512_ref.o
+sha512_mb_sha512_mb_rand_test_LDADD = sha512_mb/sha512_ref.lo libisal_crypto.la
+
+sha512_mb_rand_update_test: sha512_ref.o
+sha512_mb_sha512_mb_rand_update_test_LDADD = sha512_mb/sha512_ref.lo libisal_crypto.la
+
+sha512_mb_rand_ssl_test: LDLIBS += -lcrypto
+sha512_mb_sha512_mb_rand_ssl_test_LDFLAGS = -lcrypto
+
+sha512_mb_vs_ossl_perf: LDLIBS += -lcrypto
+sha512_mb_sha512_mb_vs_ossl_perf_LDFLAGS = -lcrypto
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_ctx_ce.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_ctx_ce.c
new file mode 100644
index 000000000..02f04197b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_ctx_ce.c
@@ -0,0 +1,256 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+void sha512_mb_mgr_init_ce(SHA512_MB_JOB_MGR * state);
+SHA512_JOB *sha512_mb_mgr_submit_ce(SHA512_MB_JOB_MGR * state, SHA512_JOB * job);
+SHA512_JOB *sha512_mb_mgr_flush_ce(SHA512_MB_JOB_MGR * state);
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx);
+
+void sha512_ctx_mgr_init_ce(SHA512_HASH_CTX_MGR * mgr)
+{
+ sha512_mb_mgr_init_ce(&mgr->mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_ce(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_fixedlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx =
+ (SHA512_HASH_CTX *) sha512_mb_mgr_submit_ce(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_ce(SHA512_HASH_CTX_MGR * mgr)
+{
+ SHA512_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_ce(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop.
+ }
+}
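+/*
+ * Illustrative driver sketch (hash_ctx_init() and HASH_ENTIRE are assumed to
+ * come from multi_buffer.h; 'buf' and 'len' are hypothetical caller names):
+ *
+ *   SHA512_HASH_CTX_MGR mgr;
+ *   SHA512_HASH_CTX ctx;
+ *   sha512_ctx_mgr_init_ce(&mgr);
+ *   hash_ctx_init(&ctx);
+ *   sha512_ctx_mgr_submit_ce(&mgr, &ctx, buf, len, HASH_ENTIRE);
+ *   while (sha512_ctx_mgr_flush_ce(&mgr) != NULL)
+ *           ;   // drain until every queued job has completed
+ *   // ctx.job.result_digest[] then holds the SHA-512 digest words
+ */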
+
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA512_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA512_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_ce(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx =
+ (SHA512_HASH_CTX *) sha512_mb_mgr_submit_ce(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA512_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_ce_slver_02020142;
+struct slver sha512_ctx_mgr_init_ce_slver = { 0x0142, 0x02, 0x02 };
+
+struct slver sha512_ctx_mgr_submit_ce_slver_02020143;
+struct slver sha512_ctx_mgr_submit_ce_slver = { 0x0143, 0x02, 0x02 };
+
+struct slver sha512_ctx_mgr_flush_ce_slver_02020144;
+struct slver sha512_ctx_mgr_flush_ce_slver = { 0x0144, 0x02, 0x02 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_aarch64_dispatcher.c
new file mode 100644
index 000000000..321e8507d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_aarch64_dispatcher.c
@@ -0,0 +1,59 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
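+/*
+ * Runtime dispatch: each interface probes the CPU via getauxval(AT_HWCAP) and
+ * selects the Crypto-Extensions (_ce) implementation when the required HWCAP
+ * bit is present, otherwise it falls back to the portable base implementation.
+ */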
+DEFINE_INTERFACE_DISPATCHER(sha512_ctx_mgr_submit)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA3)
+ return PROVIDER_INFO(sha512_ctx_mgr_submit_ce);
+
+ return PROVIDER_BASIC(sha512_ctx_mgr_submit);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(sha512_ctx_mgr_init)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA3)
+ return PROVIDER_INFO(sha512_ctx_mgr_init_ce);
+
+ return PROVIDER_BASIC(sha512_ctx_mgr_init);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(sha512_ctx_mgr_flush)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA3)
+ return PROVIDER_INFO(sha512_ctx_mgr_flush_ce);
+
+ return PROVIDER_BASIC(sha512_ctx_mgr_flush);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_mgr_ce.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_mgr_ce.c
new file mode 100644
index 000000000..43801c3d6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_mgr_ce.c
@@ -0,0 +1,210 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stddef.h>
+#include <sha512_mb.h>
+#include <assert.h>
+
+#ifndef max
+#define max(a,b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+#ifndef SHA512_MB_CE_MAX_LANES
+#define SHA512_MB_CE_MAX_LANES 2
+#endif
+
+#if SHA512_MB_CE_MAX_LANES >=2
+void sha512_mb_ce_x2(SHA512_JOB *, SHA512_JOB *, int);
+#endif
+void sha512_mb_ce_x1(SHA512_JOB *, int);
+
+#define LANE_IS_NOT_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FREE(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL)
+#define LANE_IS_INVALID(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL)
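+/*
+ * Each lens[] entry packs the remaining job length (in 128-byte blocks) into
+ * the upper bits and the lane index into the low nibble, i.e.
+ * lens[i] = (blocks << 4) | i, so taking the minimum over the active lanes
+ * yields both the shortest length and the lane that will finish first.
+ */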
+void sha512_mb_mgr_init_ce(SHA512_MB_JOB_MGR * state)
+{
+ int i;
+ //~ state->unused_lanes = 0xf3210;
+ state->unused_lanes = 0xf;
+ state->num_lanes_inuse = 0;
+ for (i = SHA512_MB_CE_MAX_LANES - 1; i >= 0; i--) {
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->lens[i] = i;
+ state->ldata[i].job_in_lane = 0;
+ }
+
+	// lanes >= SHA512_MB_CE_MAX_LANES are invalid lanes
+ for (i = SHA512_MB_CE_MAX_LANES; i < SHA512_MAX_LANES; i++) {
+ state->lens[i] = 0xf;
+ state->ldata[i].job_in_lane = 0;
+ }
+}
+
+static int sha512_mb_mgr_do_jobs(SHA512_MB_JOB_MGR * state)
+{
+ int lane_idx, len, i, lanes;
+
+ int lane_idx_array[SHA512_MAX_LANES];
+
+ if (state->num_lanes_inuse == 0) {
+ return -1;
+ }
+#if SHA512_MB_CE_MAX_LANES == 2
+ if (state->num_lanes_inuse == 2) {
+ len = min(state->lens[0], state->lens[1]);
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+
+ sha512_mb_ce_x2(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane, len >> 4);
+
+ } else
+#endif
+ {
+ lanes = 0, len = 0;
+ for (i = 0; i < SHA512_MAX_LANES && lanes < state->num_lanes_inuse; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ if (lanes)
+ len = min(len, state->lens[i]);
+ else
+ len = state->lens[i];
+ lane_idx_array[lanes] = i;
+ lanes++;
+ }
+ }
+ if (lanes == 0)
+ return -1;
+ lane_idx = len & 0xf;
+ len = len & (~0xf);
+
+#if SHA512_MB_CE_MAX_LANES >=2
+ if (lanes == 2) {
+ sha512_mb_ce_x2(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane, len >> 4);
+ } else
+#endif
+ {
+ sha512_mb_ce_x1(state->ldata[lane_idx_array[0]].job_in_lane, len >> 4);
+ }
+ }
+	// the min-length job is done; advance the remaining lanes and return its lane index
+ for (i = 0; i < SHA512_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 3;
+ }
+ }
+
+ return lane_idx;
+
+}
+
+static SHA512_JOB *sha512_mb_mgr_free_lane(SHA512_MB_JOB_MGR * state)
+{
+ int i;
+ SHA512_JOB *ret = NULL;
+
+ for (i = 0; i < SHA512_MB_CE_MAX_LANES; i++) {
+ if (LANE_IS_FINISHED(state, i)) {
+
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->num_lanes_inuse--;
+ ret = state->ldata[i].job_in_lane;
+ ret->status = STS_COMPLETED;
+ state->ldata[i].job_in_lane = NULL;
+ break;
+ }
+ }
+ return ret;
+}
+
+static void sha512_mb_mgr_insert_job(SHA512_MB_JOB_MGR * state, SHA512_JOB * job)
+{
+ int lane_idx;
+ //add job into lanes
+ lane_idx = state->unused_lanes & 0xf;
+ //fatal error
+ assert(lane_idx < SHA512_MB_CE_MAX_LANES);
+ state->lens[lane_idx] = (job->len << 4) | lane_idx;
+ state->ldata[lane_idx].job_in_lane = job;
+ state->unused_lanes >>= 4;
+ state->num_lanes_inuse++;
+}
+
+SHA512_JOB *sha512_mb_mgr_submit_ce(SHA512_MB_JOB_MGR * state, SHA512_JOB * job)
+{
+#ifndef NDEBUG
+ int lane_idx;
+#endif
+ SHA512_JOB *ret;
+
+ //add job into lanes
+ sha512_mb_mgr_insert_job(state, job);
+
+ ret = sha512_mb_mgr_free_lane(state);
+ if (ret != NULL) {
+ return ret;
+ }
+	// submit only starts processing once every lane holds a job; wait otherwise
+ if (state->num_lanes_inuse < SHA512_MB_CE_MAX_LANES)
+ return NULL;
+#ifndef NDEBUG
+ lane_idx = sha512_mb_mgr_do_jobs(state);
+ assert(lane_idx != -1);
+#else
+ sha512_mb_mgr_do_jobs(state);
+#endif
+
+ //~ i = lane_idx;
+ ret = sha512_mb_mgr_free_lane(state);
+ return ret;
+}
+
+SHA512_JOB *sha512_mb_mgr_flush_ce(SHA512_MB_JOB_MGR * state)
+{
+ SHA512_JOB *ret;
+ ret = sha512_mb_mgr_free_lane(state);
+ if (ret) {
+ return ret;
+ }
+
+ sha512_mb_mgr_do_jobs(state);
+ return sha512_mb_mgr_free_lane(state);
+
+}
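
The job manager above works at the block level: a SHA512_JOB's buffer must point at whole 128-byte blocks, its len counts blocks rather than bytes, and submit only kicks off the CE kernels once every lane holds a job, so callers have to flush to drain the last jobs. A minimal sketch of how this layer might be driven directly is shown below; the zero-initialization of the jobs and the use of result_digest are assumptions based on the fields referenced in this file and in the ctx wrappers further down, not an example taken from the library.

/* Hedged sketch: hash two pre-padded buffers through the CE job manager.
 * Assumes SHA512_JOB exposes the buffer/len/status/result_digest fields
 * used elsewhere in this patch; padding is the caller's responsibility
 * at this level. */
#include <stdint.h>
#include <sha512_mb.h>

void hash_two_buffers(uint8_t *blocks_a, uint32_t nblocks_a,
		      uint8_t *blocks_b, uint32_t nblocks_b)
{
	SHA512_MB_JOB_MGR mgr;
	SHA512_JOB job_a = { 0 }, job_b = { 0 };

	sha512_mb_mgr_init_ce(&mgr);

	job_a.buffer = blocks_a;
	job_a.len = nblocks_a;	/* length in 128-byte blocks */
	job_b.buffer = blocks_b;
	job_b.len = nblocks_b;

	/* submit may return NULL until both lanes are occupied */
	sha512_mb_mgr_submit_ce(&mgr, &job_a);
	sha512_mb_mgr_submit_ce(&mgr, &job_b);

	/* flush keeps running the kernels until no lane is busy */
	while (sha512_mb_mgr_flush_ce(&mgr) != NULL)
		;

	/* both jobs now have status == STS_COMPLETED and their chaining
	 * value updated in place (result_digest) */
}
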
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_multibinary.S b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_multibinary.S
new file mode 100644
index 000000000..58bf13478
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_multibinary.S
@@ -0,0 +1,36 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#include <aarch64_multibinary.h>
+
+
+mbin_interface sha512_ctx_mgr_submit
+mbin_interface sha512_ctx_mgr_init
+mbin_interface sha512_ctx_mgr_flush
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x1_ce.S b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x1_ce.S
new file mode 100644
index 000000000..ab5d0aed7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x1_ce.S
@@ -0,0 +1,269 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a+crypto+sha3
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+/**
+macros for rounds 0-63
+*/
+.macro sha512_rounds_low ab,cd,ef,gh,tmp,msg0,msg1,msg4,msg5,msg7
+ ldr key_q , [key_adr]
+ add l0_tmp0_v.2d,l0_\msg0\()_v.2d,key_v.2d
+ add key_adr,key_adr,16
+ ext l0_tmp1_v.16b,l0_\ef\()_v.16b,l0_\gh\()_v.16b,#8
+ ext l0_tmp0_v.16b,l0_tmp0_v.16b,l0_tmp0_v.16b,#8
+ ext l0_tmp2_v.16b,l0_\cd\()_v.16b,l0_\ef\()_v.16b,#8
+ add l0_\gh\()_v.2d,l0_\gh\()_v.2d,l0_tmp0_v.2d
+ ext l0_tmp0_v.16b,l0_\msg4\()_v.16b,l0_\msg5\()_v.16b,#8
+ sha512su0 l0_\msg0\()_v.2d,l0_\msg1\()_v.2d
+ sha512h l0_\gh\()_q,l0_tmp1_q,l0_tmp2_v.2d
+ sha512su1 l0_\msg0\()_v.2d,l0_\msg7\()_v.2d,l0_tmp0_v.2d
+ add l0_\tmp\()_v.2d,l0_\cd\()_v.2d,l0_\gh\()_v.2d
+ sha512h2 l0_\gh\()_q,l0_\cd\()_q,l0_\ab\()_v.2d
+.endm
+/**
+macros for rounds 64-79
+*/
+.macro sha512_rounds_high ab,cd,ef,gh,tmp,msg0
+ ldr key_q , [key_adr]
+ add l0_tmp0_v.2d,l0_\msg0\()_v.2d,key_v.2d
+ add key_adr,key_adr,16
+ ext l0_tmp1_v.16b,l0_\ef\()_v.16b,l0_\gh\()_v.16b,#8
+ ext l0_tmp0_v.16b,l0_tmp0_v.16b,l0_tmp0_v.16b,#8
+ ext l0_tmp2_v.16b,l0_\cd\()_v.16b,l0_\ef\()_v.16b,#8
+ add l0_\gh\()_v.2d,l0_\gh\()_v.2d,l0_tmp0_v.2d
+ sha512h l0_\gh\()_q,l0_tmp1_q,l0_tmp2_v.2d
+ add l0_\tmp\()_v.2d,l0_\cd\()_v.2d,l0_\gh\()_v.2d
+ sha512h2 l0_\gh\()_q,l0_\cd\()_q,l0_\ab\()_v.2d
+.endm
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg key,31
+
+
+/*
+digest variables
+*/
+ declare_var_vector_reg l0_ab,0
+ declare_var_vector_reg l0_cd,1
+ declare_var_vector_reg l0_ef,2
+ declare_var_vector_reg l0_gh,3
+
+ declare_var_vector_reg l0_tmp,4
+ declare_var_vector_reg l0_ab_saved,24
+ declare_var_vector_reg l0_cd_saved,25
+ declare_var_vector_reg l0_ef_saved,26
+ declare_var_vector_reg l0_gh_saved,27
+/*
+Temporary variables
+*/
+ declare_var_vector_reg l0_tmp0,5
+ declare_var_vector_reg l0_tmp1,6
+ declare_var_vector_reg l0_tmp2,7
+
+/*
+Message variables
+*/
+ declare_var_vector_reg l0_msg0,16
+ declare_var_vector_reg l0_msg1,17
+ declare_var_vector_reg l0_msg2,18
+ declare_var_vector_reg l0_msg3,19
+ declare_var_vector_reg l0_msg4,20
+ declare_var_vector_reg l0_msg5,21
+ declare_var_vector_reg l0_msg6,22
+ declare_var_vector_reg l0_msg7,23
+
+
+
+/*
+	void sha512_mb_ce_x1(SHA512_JOB * l0_job, int len);
+*/
+/*
+Arguments list
+*/
+ l0_job .req x0
+ len .req w1
+ l0_data .req x2
+ key_adr .req x3
+ .global sha512_mb_ce_x1
+ .type sha512_mb_ce_x1, %function
+sha512_mb_ce_x1:
+ ldr l0_data, [l0_job]
+ // load initial digest
+ add x4,l0_job,64
+ ld1 {l0_ab_v.4s-l0_gh_v.4s},[x4]
+
+
+
+start_loop:
+ adr key_adr, KEY
+ //load msgs
+ ld1 {l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
+ add l0_data,l0_data,64
+ ld1 {l0_msg4_v.4s-l0_msg7_v.4s},[l0_data]
+ add l0_data,l0_data,64
+ //adjust loop parameter
+
+ sub len, len, #1
+ cmp len, 0
+
+ //save state
+ mov l0_ab_saved_v.16b,l0_ab_v.16b
+ mov l0_cd_saved_v.16b,l0_cd_v.16b
+ mov l0_ef_saved_v.16b,l0_ef_v.16b
+ mov l0_gh_saved_v.16b,l0_gh_v.16b
+
+ //rev endian
+ rev64 l0_msg0_v.16b,l0_msg0_v.16b
+ rev64 l0_msg1_v.16b,l0_msg1_v.16b
+ rev64 l0_msg2_v.16b,l0_msg2_v.16b
+ rev64 l0_msg3_v.16b,l0_msg3_v.16b
+ rev64 l0_msg4_v.16b,l0_msg4_v.16b
+ rev64 l0_msg5_v.16b,l0_msg5_v.16b
+ rev64 l0_msg6_v.16b,l0_msg6_v.16b
+ rev64 l0_msg7_v.16b,l0_msg7_v.16b
+
+
+
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg0,msg1,msg4,msg5,msg7 /* rounds 0- 1 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg1,msg2,msg5,msg6,msg0 /* rounds 2- 3 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg2,msg3,msg6,msg7,msg1 /* rounds 4- 5 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg3,msg4,msg7,msg0,msg2 /* rounds 6- 7 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg4,msg5,msg0,msg1,msg3 /* rounds 8- 9 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg5,msg6,msg1,msg2,msg4 /* rounds 10-11 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg6,msg7,msg2,msg3,msg5 /* rounds 12-13 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg7,msg0,msg3,msg4,msg6 /* rounds 14-15 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg0,msg1,msg4,msg5,msg7 /* rounds 16-17 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg1,msg2,msg5,msg6,msg0 /* rounds 18-19 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg2,msg3,msg6,msg7,msg1 /* rounds 20-21 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg3,msg4,msg7,msg0,msg2 /* rounds 22-23 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg4,msg5,msg0,msg1,msg3 /* rounds 24-25 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg5,msg6,msg1,msg2,msg4 /* rounds 26-27 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg6,msg7,msg2,msg3,msg5 /* rounds 28-29 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg7,msg0,msg3,msg4,msg6 /* rounds 30-31 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg0,msg1,msg4,msg5,msg7 /* rounds 32-33 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg1,msg2,msg5,msg6,msg0 /* rounds 34-35 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg2,msg3,msg6,msg7,msg1 /* rounds 36-37 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg3,msg4,msg7,msg0,msg2 /* rounds 38-39 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg4,msg5,msg0,msg1,msg3 /* rounds 40-41 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg5,msg6,msg1,msg2,msg4 /* rounds 42-43 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg6,msg7,msg2,msg3,msg5 /* rounds 44-45 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg7,msg0,msg3,msg4,msg6 /* rounds 46-47 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg0,msg1,msg4,msg5,msg7 /* rounds 48-49 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg1,msg2,msg5,msg6,msg0 /* rounds 50-51 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg2,msg3,msg6,msg7,msg1 /* rounds 52-53 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg3,msg4,msg7,msg0,msg2 /* rounds 54-55 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg4,msg5,msg0,msg1,msg3 /* rounds 56-57 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg5,msg6,msg1,msg2,msg4 /* rounds 58-59 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg6,msg7,msg2,msg3,msg5 /* rounds 60-61 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg7,msg0,msg3,msg4,msg6 /* rounds 62-63 */
+ sha512_rounds_high ef, gh, cd,tmp, ab,msg0 /* rounds 64-65 */
+ sha512_rounds_high tmp, ef, ab, cd, gh,msg1 /* rounds 66-67 */
+ sha512_rounds_high cd,tmp, gh, ab, ef,msg2 /* rounds 68-69 */
+ sha512_rounds_high ab, cd, ef, gh,tmp,msg3 /* rounds 70-71 */
+ sha512_rounds_high gh, ab,tmp, ef, cd,msg4 /* rounds 72-73 */
+ sha512_rounds_high ef, gh, cd,tmp, ab,msg5 /* rounds 74-75 */
+ sha512_rounds_high tmp, ef, ab, cd, gh,msg6 /* rounds 76-77 */
+ sha512_rounds_high cd,tmp, gh, ab, ef,msg7 /* rounds 78-79 */
+
+
+
+ add l0_ab_v.2d,l0_ab_v.2d,l0_ab_saved_v.2d
+ add l0_cd_v.2d,l0_cd_v.2d,l0_cd_saved_v.2d
+ add l0_ef_v.2d,l0_ef_v.2d,l0_ef_saved_v.2d
+ add l0_gh_v.2d,l0_gh_v.2d,l0_gh_saved_v.2d
+
+
+ bgt start_loop
+
+ add x4,l0_job,64
+ st1 {l0_ab_v.4s-l0_gh_v.4s},[x4]
+
+
+ ret
+
+ .size sha512_mb_ce_x1, .-sha512_mb_ce_x1
+ .section .rol0_data.cst16,"aM",@progbits,16
+ .align 4
+KEY:
+ .quad 0x428a2f98d728ae22, 0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538, 0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242, 0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235, 0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
+ .quad 0x983e5152ee66dfab, 0xa831c66d2db43210
+ .quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725
+ .quad 0x06ca6351e003826f, 0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
+ .quad 0x650a73548baf63de, 0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6, 0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791, 0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218, 0xd69906245565a910
+ .quad 0xf40e35855771202a, 0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc, 0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec
+ .quad 0x90befffa23631e28, 0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b
+ .quad 0xca273eceea26619c, 0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae, 0x1b710b35131c471b
+ .quad 0x28db77f523047d84, 0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
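
For readers who want to map the sha512su0/sha512su1/sha512h/sha512h2 instructions above back to the specification: the kernel is a two-rounds-per-macro implementation of the FIPS 180-4 compression function, with KEY holding the 80 round constants. The scalar sketch below shows the message-schedule recurrence that the su0/su1 instructions compute, using the same sigma definitions that appear in sha512_ctx_base.c later in this patch; it is illustrative only and not part of the source.

/* Scalar view of the SHA-512 message schedule that sha512su0/sha512su1
 * implement two lanes at a time.  ror64/S0/S1 mirror the macros in
 * sha512_ctx_base.c below. */
#include <stdint.h>

#define ror64(x, r) (((x) >> (r)) ^ ((x) << (64 - (r))))
#define S0(w) (ror64(w, 1) ^ ror64(w, 8) ^ ((w) >> 7))
#define S1(w) (ror64(w, 19) ^ ror64(w, 61) ^ ((w) >> 6))

/* Expand the 16 message words of one 128-byte block (already byte-swapped,
 * as rev64 does in the kernel) into the 80-entry schedule used by rounds 0..79. */
static void sha512_schedule(uint64_t W[80], const uint64_t block[16])
{
	int t;

	for (t = 0; t < 16; t++)
		W[t] = block[t];
	for (t = 16; t < 80; t++)
		W[t] = W[t - 16] + S0(W[t - 15]) + W[t - 7] + S1(W[t - 2]);
}
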
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x2_ce.S b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x2_ce.S
new file mode 100644
index 000000000..7864eb028
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x2_ce.S
@@ -0,0 +1,390 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a+crypto+sha3
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+/**
+macros for rounds 0-63
+*/
+.macro sha512_rounds_low ab,cd,ef,gh,tmp,msg0,msg1,msg4,msg5,msg7
+ ldr key_q , [key_adr]
+ add l0_tmp0_v.2d,l0_\msg0\()_v.2d,key_v.2d
+ add l1_tmp0_v.2d,l1_\msg0\()_v.2d,key_v.2d
+ add key_adr,key_adr,16
+
+
+ ext l0_tmp1_v.16b,l0_\ef\()_v.16b,l0_\gh\()_v.16b,#8
+ ext l1_tmp1_v.16b,l1_\ef\()_v.16b,l1_\gh\()_v.16b,#8
+
+
+ ext l0_tmp0_v.16b,l0_tmp0_v.16b,l0_tmp0_v.16b,#8
+ ext l1_tmp0_v.16b,l1_tmp0_v.16b,l1_tmp0_v.16b,#8
+
+
+ ext l0_tmp2_v.16b,l0_\cd\()_v.16b,l0_\ef\()_v.16b,#8
+ ext l1_tmp2_v.16b,l1_\cd\()_v.16b,l1_\ef\()_v.16b,#8
+
+
+ add l0_\gh\()_v.2d,l0_\gh\()_v.2d,l0_tmp0_v.2d
+ add l1_\gh\()_v.2d,l1_\gh\()_v.2d,l1_tmp0_v.2d
+
+
+ ext l0_tmp0_v.16b,l0_\msg4\()_v.16b,l0_\msg5\()_v.16b,#8
+ ext l1_tmp0_v.16b,l1_\msg4\()_v.16b,l1_\msg5\()_v.16b,#8
+
+ sha512su0 l0_\msg0\()_v.2d,l0_\msg1\()_v.2d
+ sha512su0 l1_\msg0\()_v.2d,l1_\msg1\()_v.2d
+
+ sha512h l0_\gh\()_q,l0_tmp1_q,l0_tmp2_v.2d
+ sha512h l1_\gh\()_q,l1_tmp1_q,l1_tmp2_v.2d
+
+ sha512su1 l0_\msg0\()_v.2d,l0_\msg7\()_v.2d,l0_tmp0_v.2d
+ sha512su1 l1_\msg0\()_v.2d,l1_\msg7\()_v.2d,l1_tmp0_v.2d
+
+ add l0_\tmp\()_v.2d,l0_\cd\()_v.2d,l0_\gh\()_v.2d
+ add l1_\tmp\()_v.2d,l1_\cd\()_v.2d,l1_\gh\()_v.2d
+
+ sha512h2 l0_\gh\()_q,l0_\cd\()_q,l0_\ab\()_v.2d
+ sha512h2 l1_\gh\()_q,l1_\cd\()_q,l1_\ab\()_v.2d
+.endm
+
+/**
+macros for rounds 64-79
+*/
+.macro sha512_rounds_high ab,cd,ef,gh,tmp,msg0
+ ldr key_q , [key_adr]
+ add l0_tmp0_v.2d,l0_\msg0\()_v.2d,key_v.2d
+ add l1_tmp0_v.2d,l1_\msg0\()_v.2d,key_v.2d
+ add key_adr,key_adr,16
+
+
+ ext l0_tmp1_v.16b,l0_\ef\()_v.16b,l0_\gh\()_v.16b,#8
+ ext l1_tmp1_v.16b,l1_\ef\()_v.16b,l1_\gh\()_v.16b,#8
+
+
+ ext l0_tmp0_v.16b,l0_tmp0_v.16b,l0_tmp0_v.16b,#8
+ ext l1_tmp0_v.16b,l1_tmp0_v.16b,l1_tmp0_v.16b,#8
+
+
+ ext l0_tmp2_v.16b,l0_\cd\()_v.16b,l0_\ef\()_v.16b,#8
+ ext l1_tmp2_v.16b,l1_\cd\()_v.16b,l1_\ef\()_v.16b,#8
+
+
+ add l0_\gh\()_v.2d,l0_\gh\()_v.2d,l0_tmp0_v.2d
+ add l1_\gh\()_v.2d,l1_\gh\()_v.2d,l1_tmp0_v.2d
+
+
+
+ sha512h l0_\gh\()_q,l0_tmp1_q,l0_tmp2_v.2d
+ sha512h l1_\gh\()_q,l1_tmp1_q,l1_tmp2_v.2d
+
+
+ add l0_\tmp\()_v.2d,l0_\cd\()_v.2d,l0_\gh\()_v.2d
+ add l1_\tmp\()_v.2d,l1_\cd\()_v.2d,l1_\gh\()_v.2d
+
+ sha512h2 l0_\gh\()_q,l0_\cd\()_q,l0_\ab\()_v.2d
+ sha512h2 l1_\gh\()_q,l1_\cd\()_q,l1_\ab\()_v.2d
+.endm
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg key,6
+
+
+/*
+digest variables
+*/
+ declare_var_vector_reg l0_ab,0
+ declare_var_vector_reg l0_cd,1
+ declare_var_vector_reg l0_ef,2
+ declare_var_vector_reg l0_gh,3
+ declare_var_vector_reg l0_tmp,4
+
+ declare_var_vector_reg l1_ab,8
+ declare_var_vector_reg l1_cd,9
+ declare_var_vector_reg l1_ef,10
+ declare_var_vector_reg l1_gh,11
+ declare_var_vector_reg l1_tmp,12
+
+
+ declare_var_vector_reg l0_ab_saved,16
+ declare_var_vector_reg l0_cd_saved,17
+ declare_var_vector_reg l0_ef_saved,18
+ declare_var_vector_reg l0_gh_saved,19
+ declare_var_vector_reg l1_ab_saved,24
+ declare_var_vector_reg l1_cd_saved,25
+ declare_var_vector_reg l1_ef_saved,26
+ declare_var_vector_reg l1_gh_saved,27
+/*
+Temporary variables
+*/
+ declare_var_vector_reg l0_tmp0,5
+ declare_var_vector_reg l0_tmp1,6
+ declare_var_vector_reg l0_tmp2,7
+
+ declare_var_vector_reg l1_tmp0,13
+ declare_var_vector_reg l1_tmp1,14
+ declare_var_vector_reg l1_tmp2,15
+
+
+
+/*
+Message variables
+*/
+ declare_var_vector_reg l0_msg0,16
+ declare_var_vector_reg l0_msg1,17
+ declare_var_vector_reg l0_msg2,18
+ declare_var_vector_reg l0_msg3,19
+ declare_var_vector_reg l0_msg4,20
+ declare_var_vector_reg l0_msg5,21
+ declare_var_vector_reg l0_msg6,22
+ declare_var_vector_reg l0_msg7,23
+
+ declare_var_vector_reg l1_msg0,24
+ declare_var_vector_reg l1_msg1,25
+ declare_var_vector_reg l1_msg2,26
+ declare_var_vector_reg l1_msg3,27
+ declare_var_vector_reg l1_msg4,28
+ declare_var_vector_reg l1_msg5,29
+ declare_var_vector_reg l1_msg6,30
+ declare_var_vector_reg l1_msg7,31
+
+
+
+/*
+ void sha512_mb_ce_x2(SHA512_JOB *, SHA512_JOB *, int);
+*/
+/*
+Arguments list
+*/
+ l0_job .req x0
+ l1_job .req x1
+ len .req w2
+ l0_data .req x3
+ l1_data .req x4
+ key_adr .req x5
+ l0_digest_adr .req x6
+ l1_digest_adr .req x7
+ .global sha512_mb_ce_x2
+ .type sha512_mb_ce_x2, %function
+sha512_mb_ce_x2:
+ //push d8~d15
+ stp d8,d9,[sp,-192]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+
+
+ ldr l0_data, [l0_job]
+ ldr l1_data, [l1_job]
+ // load initial digest
+ add l0_digest_adr,l0_job,64
+ add l1_digest_adr,l1_job,64
+ ld1 {l0_ab_v.4s-l0_gh_v.4s},[l0_digest_adr]
+ ld1 {l1_ab_v.4s-l1_gh_v.4s},[l1_digest_adr]
+
+
+
+start_loop:
+
+ adr key_adr, KEY
+ //load msgs
+ ld1 {l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
+ add l0_data,l0_data,64
+ ld1 {l0_msg4_v.4s-l0_msg7_v.4s},[l0_data]
+ add l0_data,l0_data,64
+
+ ld1 {l1_msg0_v.4s-l1_msg3_v.4s},[l1_data]
+ add l1_data,l1_data,64
+ ld1 {l1_msg4_v.4s-l1_msg7_v.4s},[l1_data]
+ add l1_data,l1_data,64
+
+ //adjust loop parameter
+ sub len, len, #1
+ cmp len, 0
+
+
+
+ //rev endian
+ rev64 l0_msg0_v.16b,l0_msg0_v.16b
+ rev64 l0_msg1_v.16b,l0_msg1_v.16b
+ rev64 l0_msg2_v.16b,l0_msg2_v.16b
+ rev64 l0_msg3_v.16b,l0_msg3_v.16b
+ rev64 l0_msg4_v.16b,l0_msg4_v.16b
+ rev64 l0_msg5_v.16b,l0_msg5_v.16b
+ rev64 l0_msg6_v.16b,l0_msg6_v.16b
+ rev64 l0_msg7_v.16b,l0_msg7_v.16b
+
+ rev64 l1_msg0_v.16b,l1_msg0_v.16b
+ rev64 l1_msg1_v.16b,l1_msg1_v.16b
+ rev64 l1_msg2_v.16b,l1_msg2_v.16b
+ rev64 l1_msg3_v.16b,l1_msg3_v.16b
+ rev64 l1_msg4_v.16b,l1_msg4_v.16b
+ rev64 l1_msg5_v.16b,l1_msg5_v.16b
+ rev64 l1_msg6_v.16b,l1_msg6_v.16b
+ rev64 l1_msg7_v.16b,l1_msg7_v.16b
+
+
+
+
+
+
+
+
+
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg0,msg1,msg4,msg5,msg7 /* rounds 0- 1 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg1,msg2,msg5,msg6,msg0 /* rounds 2- 3 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg2,msg3,msg6,msg7,msg1 /* rounds 4- 5 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg3,msg4,msg7,msg0,msg2 /* rounds 6- 7 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg4,msg5,msg0,msg1,msg3 /* rounds 8- 9 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg5,msg6,msg1,msg2,msg4 /* rounds 10-11 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg6,msg7,msg2,msg3,msg5 /* rounds 12-13 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg7,msg0,msg3,msg4,msg6 /* rounds 14-15 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg0,msg1,msg4,msg5,msg7 /* rounds 16-17 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg1,msg2,msg5,msg6,msg0 /* rounds 18-19 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg2,msg3,msg6,msg7,msg1 /* rounds 20-21 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg3,msg4,msg7,msg0,msg2 /* rounds 22-23 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg4,msg5,msg0,msg1,msg3 /* rounds 24-25 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg5,msg6,msg1,msg2,msg4 /* rounds 26-27 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg6,msg7,msg2,msg3,msg5 /* rounds 28-29 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg7,msg0,msg3,msg4,msg6 /* rounds 30-31 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg0,msg1,msg4,msg5,msg7 /* rounds 32-33 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg1,msg2,msg5,msg6,msg0 /* rounds 34-35 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg2,msg3,msg6,msg7,msg1 /* rounds 36-37 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg3,msg4,msg7,msg0,msg2 /* rounds 38-39 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg4,msg5,msg0,msg1,msg3 /* rounds 40-41 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg5,msg6,msg1,msg2,msg4 /* rounds 42-43 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg6,msg7,msg2,msg3,msg5 /* rounds 44-45 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg7,msg0,msg3,msg4,msg6 /* rounds 46-47 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg0,msg1,msg4,msg5,msg7 /* rounds 48-49 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg1,msg2,msg5,msg6,msg0 /* rounds 50-51 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg2,msg3,msg6,msg7,msg1 /* rounds 52-53 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg3,msg4,msg7,msg0,msg2 /* rounds 54-55 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg4,msg5,msg0,msg1,msg3 /* rounds 56-57 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg5,msg6,msg1,msg2,msg4 /* rounds 58-59 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg6,msg7,msg2,msg3,msg5 /* rounds 60-61 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg7,msg0,msg3,msg4,msg6 /* rounds 62-63 */
+ sha512_rounds_high ef, gh, cd,tmp, ab,msg0 /* rounds 64-65 */
+ sha512_rounds_high tmp, ef, ab, cd, gh,msg1 /* rounds 66-67 */
+ sha512_rounds_high cd,tmp, gh, ab, ef,msg2 /* rounds 68-69 */
+ sha512_rounds_high ab, cd, ef, gh,tmp,msg3 /* rounds 70-71 */
+ ld1 {l0_ab_saved_v.4s-l0_gh_saved_v.4s},[l0_digest_adr]
+ ld1 {l1_ab_saved_v.4s-l1_gh_saved_v.4s},[l1_digest_adr]
+ sha512_rounds_high gh, ab,tmp, ef, cd,msg4 /* rounds 72-73 */
+ sha512_rounds_high ef, gh, cd,tmp, ab,msg5 /* rounds 74-75 */
+ sha512_rounds_high tmp, ef, ab, cd, gh,msg6 /* rounds 76-77 */
+ sha512_rounds_high cd,tmp, gh, ab, ef,msg7 /* rounds 78-79 */
+
+
+
+ add l0_ab_v.2d,l0_ab_v.2d,l0_ab_saved_v.2d
+ add l0_cd_v.2d,l0_cd_v.2d,l0_cd_saved_v.2d
+ add l0_ef_v.2d,l0_ef_v.2d,l0_ef_saved_v.2d
+ add l0_gh_v.2d,l0_gh_v.2d,l0_gh_saved_v.2d
+ st1 {l0_ab_v.2d-l0_gh_v.2d},[l0_digest_adr]
+
+ add l1_ab_v.2d,l1_ab_v.2d,l1_ab_saved_v.2d
+ add l1_cd_v.2d,l1_cd_v.2d,l1_cd_saved_v.2d
+ add l1_ef_v.2d,l1_ef_v.2d,l1_ef_saved_v.2d
+ add l1_gh_v.2d,l1_gh_v.2d,l1_gh_saved_v.2d
+ st1 {l1_ab_v.2d-l1_gh_v.2d},[l1_digest_adr]
+
+
+
+
+ bgt start_loop
+
+ add x4,l0_job,64
+
+
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], 192
+
+ ret
+
+ .size sha512_mb_ce_x2, .-sha512_mb_ce_x2
+ .section .rol0_data.cst16,"aM",@progbits,16
+ .align 4
+KEY:
+ .quad 0x428a2f98d728ae22, 0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538, 0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242, 0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235, 0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
+ .quad 0x983e5152ee66dfab, 0xa831c66d2db43210
+ .quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725
+ .quad 0x06ca6351e003826f, 0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
+ .quad 0x650a73548baf63de, 0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6, 0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791, 0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218, 0xd69906245565a910
+ .quad 0xf40e35855771202a, 0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc, 0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec
+ .quad 0x90befffa23631e28, 0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b
+ .quad 0xca273eceea26619c, 0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae, 0x1b710b35131c471b
+ .quad 0x28db77f523047d84, 0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c
new file mode 100644
index 000000000..24d96763a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c
@@ -0,0 +1,269 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx")
+#endif
+
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx);
+
+void sha512_ctx_mgr_init_avx(SHA512_HASH_CTX_MGR * mgr)
+{
+ sha512_mb_mgr_init_avx(&mgr->mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_avx(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_avx(SHA512_HASH_CTX_MGR * mgr)
+{
+ SHA512_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_avx(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA512_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA512_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA512_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_avx_slver_02020166;
+struct slver sha512_ctx_mgr_init_avx_slver = { 0x0166, 0x02, 0x02 };
+
+struct slver sha512_ctx_mgr_submit_avx_slver_02020167;
+struct slver sha512_ctx_mgr_submit_avx_slver = { 0x0167, 0x02, 0x02 };
+
+struct slver sha512_ctx_mgr_flush_avx_slver_02020168;
+struct slver sha512_ctx_mgr_flush_avx_slver = { 0x0168, 0x02, 0x02 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
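
In contrast to the raw job manager, this context-manager layer accepts arbitrary byte counts: it buffers partial blocks, appends the SHA-512 padding in hash_pad(), and drives the underlying sha512_mb_mgr_*_avx functions. A hedged usage sketch follows; hash_ctx_init() is a helper assumed from the wider multi-buffer framework headers rather than something defined in this file, so take the snippet as an illustration of the submit/flush pattern, not as canonical example code.

/* Sketch: hash several buffers concurrently through the AVX context manager.
 * hash_ctx_init() is assumed from the multi-buffer framework headers;
 * error and status handling is reduced to the essentials. */
#include <stdint.h>
#include <stdio.h>
#include "sha512_mb.h"

#define NUM_JOBS 4

void hash_buffers(const void *bufs[NUM_JOBS], uint32_t lens[NUM_JOBS])
{
	SHA512_HASH_CTX_MGR mgr;
	SHA512_HASH_CTX ctxpool[NUM_JOBS], *ctx;
	int i;

	sha512_ctx_mgr_init_avx(&mgr);

	for (i = 0; i < NUM_JOBS; i++) {
		hash_ctx_init(&ctxpool[i]);	/* assumed helper, see lead-in */
		ctx = sha512_ctx_mgr_submit_avx(&mgr, &ctxpool[i],
						bufs[i], lens[i], HASH_ENTIRE);
		if (ctx && ctx->error != HASH_CTX_ERROR_NONE)
			fprintf(stderr, "submit error %d\n", ctx->error);
	}

	/* completed contexts trickle out of submit and flush; flush until empty */
	while ((ctx = sha512_ctx_mgr_flush_avx(&mgr)) != NULL)
		;	/* ctx->job.result_digest holds the finished digest */
}
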
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c
new file mode 100644
index 000000000..9923e2097
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c
@@ -0,0 +1,269 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx);
+
+void sha512_ctx_mgr_init_avx2(SHA512_HASH_CTX_MGR * mgr)
+{
+ sha512_mb_mgr_init_avx2(&mgr->mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_avx2(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_avx2(SHA512_HASH_CTX_MGR * mgr)
+{
+ SHA512_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_avx2(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA512_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA512_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA512_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_avx2_slver_04020169;
+struct slver sha512_ctx_mgr_init_avx2_slver = { 0x0169, 0x02, 0x04 };
+
+struct slver sha512_ctx_mgr_submit_avx2_slver_04020170;
+struct slver sha512_ctx_mgr_submit_avx2_slver = { 0x0170, 0x02, 0x04 };
+
+struct slver sha512_ctx_mgr_flush_avx2_slver_04020171;
+struct slver sha512_ctx_mgr_flush_avx2_slver = { 0x0171, 0x02, 0x04 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
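
The index arithmetic in hash_pad() above encodes the usual SHA-512 padding rule: after the 0x80 marker there must still be room for the 16-byte length field, so when total_len mod 128 is at most 111 everything fits in one extra block, and for 112..127 a second block is needed. The small self-contained check below mirrors that expression with the same constants (128-byte block, 16-byte length field); it is only an illustration of the arithmetic, not library code.

/* Mirror hash_pad()'s block-count arithmetic and compare it with the
 * plain-English padding rule, for a few representative lengths. */
#include <assert.h>
#include <stdint.h>

#define BLOCK 128	/* SHA512_BLOCK_SIZE */
#define LENFIELD 16	/* SHA512_PADLENGTHFIELD_SIZE */

static uint32_t extra_blocks(uint64_t total_len)
{
	uint32_t i = (uint32_t) (total_len & (BLOCK - 1));

	/* same expression as hash_pad(): move i to the end of the 1st or
	 * 2nd extra block, then divide by the block size */
	i += ((BLOCK - 1) & (0 - (total_len + LENFIELD + 1))) + 1 + LENFIELD;
	return i / BLOCK;
}

int main(void)
{
	/* 200 % 128 = 72  <= 111 -> one extra block
	 * 120 % 128 = 120 >  111 -> two extra blocks */
	assert(extra_blocks(200) == 1);
	assert(extra_blocks(120) == 2);
	assert(extra_blocks(0) == 1);	/* empty message still needs padding */
	return 0;
}
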
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c
new file mode 100644
index 000000000..5c0757716
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c
@@ -0,0 +1,274 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx);
+
+void sha512_ctx_mgr_init_avx512(SHA512_HASH_CTX_MGR * mgr)
+{
+ sha512_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_avx512(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_avx512(SHA512_HASH_CTX_MGR * mgr)
+{
+ SHA512_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_avx512(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA512_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA512_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx =
+ (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA512_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_avx512_slver_0600016a;
+struct slver sha512_ctx_mgr_init_avx512_slver = { 0x016a, 0x00, 0x06 };
+
+struct slver sha512_ctx_mgr_submit_avx512_slver_0600016b;
+struct slver sha512_ctx_mgr_submit_avx512_slver = { 0x016b, 0x00, 0x06 };
+
+struct slver sha512_ctx_mgr_flush_avx512_slver_0600016c;
+struct slver sha512_ctx_mgr_flush_avx512_slver = { 0x016c, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base.c
new file mode 100644
index 000000000..61a8fa000
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base.c
@@ -0,0 +1,323 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+/* From FIPS 180-4: these are the same as for SHA-256, but operate on 64-bit words
+ * instead of 32-bit words.
+ */
+#define ch(e,f,g) ((e & f) ^ (g & ~e))
+#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c))
+
+/* The sigma functions have the same form as for SHA-256, except that
+ * - the word size is 64 bits
+ * - the rotation amounts differ
+ */
+#define ror64(x, r) (((x)>>(r)) ^ ((x)<<(64-(r))))
+
+/* Strictly speaking, s0/s1 here are the "capital sigma" functions (Σ0/Σ1) and S0/S1 the
+ * lowercase ones (σ0/σ1), i.e. the names are swapped relative to the standard; they are
+ * kept as-is for consistency with the other reference implementations.
+ */
+#define s0(a) (ror64(a,28) ^ ror64(a,34) ^ ror64(a,39))
+#define s1(e) (ror64(e,14) ^ ror64(e,18) ^ ror64(e,41))
+
+#define S0(w) (ror64(w,1) ^ ror64(w,8) ^ (w >> 7))
+#define S1(w) (ror64(w,19) ^ ror64(w,61) ^ (w >> 6))
+
+#define W(x) w[(x) & 15]
+
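+// One SHA-512 round per invocation.  Callers pass the working variables a..h in rotated
+// order on successive calls, so the variables never need to be physically shuffled.
+// W() indexes a 16-entry circular message schedule; the first 16 rounds load the
+// message words through to_be64().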
+#define step(i,a,b,c,d,e,f,g,h,k) \
+ if (i<16) W(i) = to_be64(ww[i]); \
+ else \
+ W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \
+ t2 = s0(a) + maj(a,b,c); \
+ t1 = h + s1(e) + ch(e,f,g) + k + W(i); \
+ d += t1; \
+ h = t1 + t2;
+
+static void sha512_init(SHA512_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static uint32_t sha512_update(SHA512_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static void sha512_final(SHA512_HASH_CTX * ctx, uint32_t remain_len);
+static void sha512_single(const void *data, uint64_t digest[]);
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+
+void sha512_ctx_mgr_init_base(SHA512_HASH_CTX_MGR * mgr)
+{
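+	// The base (single-buffer) implementation keeps no manager state, so there is
+	// nothing to initialize.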
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_base(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ uint32_t remain_len;
+
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_PROCESSING) && (flags == HASH_ENTIRE)) {
+ // Cannot submit a new entire job to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags == HASH_FIRST) {
+
+ sha512_init(ctx, buffer, len);
+ sha512_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_UPDATE) {
+ sha512_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_LAST) {
+ remain_len = sha512_update(ctx, buffer, len);
+ sha512_final(ctx, remain_len);
+ }
+
+ if (flags == HASH_ENTIRE) {
+ sha512_init(ctx, buffer, len);
+ remain_len = sha512_update(ctx, buffer, len);
+ sha512_final(ctx, remain_len);
+ }
+
+ return ctx;
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_base(SHA512_HASH_CTX_MGR * mgr)
+{
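+	// Nothing to flush: the base implementation hashes each submission to completion
+	// inside sha512_ctx_mgr_submit_base, so no jobs are ever left in flight.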
+ return NULL;
+}
+
+static void sha512_init(SHA512_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Mark it as processing
+ ctx->status = HASH_CTX_STS_PROCESSING;
+}
+
+static uint32_t sha512_update(SHA512_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ uint32_t remain_len = len;
+ uint64_t *digest = ctx->job.result_digest;
+
+ while (remain_len >= SHA512_BLOCK_SIZE) {
+ sha512_single(buffer, digest);
+ buffer = (void *)((uint8_t *) buffer + SHA512_BLOCK_SIZE);
+ remain_len -= SHA512_BLOCK_SIZE;
+ ctx->total_length += SHA512_BLOCK_SIZE;
+ }
+ ctx->status = HASH_CTX_STS_IDLE;
+ ctx->incoming_buffer = buffer;
+ return remain_len;
+}
+
+static void sha512_final(SHA512_HASH_CTX * ctx, uint32_t remain_len)
+{
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t i = remain_len, j;
+ uint8_t buf[2 * SHA512_BLOCK_SIZE];
+ uint64_t *digest = ctx->job.result_digest;
+
+ ctx->total_length += i;
+ memcpy(buf, buffer, i);
+ buf[i++] = 0x80;
+ for (j = i; j < (2 * SHA512_BLOCK_SIZE); j++)
+ buf[j] = 0;
+
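+	// Pad to two blocks when the 0x80 byte plus the 16-byte length field no longer
+	// fit in a single block; otherwise one padded block is enough.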
+ if (i > SHA512_BLOCK_SIZE - SHA512_PADLENGTHFIELD_SIZE)
+ i = 2 * SHA512_BLOCK_SIZE;
+ else
+ i = SHA512_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) ctx->total_length * 8);
+
+ sha512_single(buf, digest);
+ if (i == 2 * SHA512_BLOCK_SIZE) {
+ sha512_single(buf + SHA512_BLOCK_SIZE, digest);
+ }
+
+ ctx->status = HASH_CTX_STS_COMPLETE;
+}
+
+static void sha512_single(const void *data, uint64_t digest[])
+{
+ /* Check these are all uint64_t */
+ uint64_t a, b, c, d, e, f, g, h, t1, t2;
+ uint64_t w[16];
+ uint64_t *ww = (uint64_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+ f = digest[5];
+ g = digest[6];
+ h = digest[7];
+
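+	// The 80 SHA-512 round constants K[0..79] from FIPS 180-4 appear inline below.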
+ step(0, a, b, c, d, e, f, g, h, 0x428a2f98d728ae22);
+ step(1, h, a, b, c, d, e, f, g, 0x7137449123ef65cd);
+ step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcfec4d3b2f);
+ step(3, f, g, h, a, b, c, d, e, 0xe9b5dba58189dbbc);
+ step(4, e, f, g, h, a, b, c, d, 0x3956c25bf348b538);
+ step(5, d, e, f, g, h, a, b, c, 0x59f111f1b605d019);
+ step(6, c, d, e, f, g, h, a, b, 0x923f82a4af194f9b);
+ step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5da6d8118);
+ step(8, a, b, c, d, e, f, g, h, 0xd807aa98a3030242);
+ step(9, h, a, b, c, d, e, f, g, 0x12835b0145706fbe);
+ step(10, g, h, a, b, c, d, e, f, 0x243185be4ee4b28c);
+ step(11, f, g, h, a, b, c, d, e, 0x550c7dc3d5ffb4e2);
+ step(12, e, f, g, h, a, b, c, d, 0x72be5d74f27b896f);
+ step(13, d, e, f, g, h, a, b, c, 0x80deb1fe3b1696b1);
+ step(14, c, d, e, f, g, h, a, b, 0x9bdc06a725c71235);
+ step(15, b, c, d, e, f, g, h, a, 0xc19bf174cf692694);
+ step(16, a, b, c, d, e, f, g, h, 0xe49b69c19ef14ad2);
+ step(17, h, a, b, c, d, e, f, g, 0xefbe4786384f25e3);
+ step(18, g, h, a, b, c, d, e, f, 0x0fc19dc68b8cd5b5);
+ step(19, f, g, h, a, b, c, d, e, 0x240ca1cc77ac9c65);
+ step(20, e, f, g, h, a, b, c, d, 0x2de92c6f592b0275);
+ step(21, d, e, f, g, h, a, b, c, 0x4a7484aa6ea6e483);
+ step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dcbd41fbd4);
+ step(23, b, c, d, e, f, g, h, a, 0x76f988da831153b5);
+ step(24, a, b, c, d, e, f, g, h, 0x983e5152ee66dfab);
+ step(25, h, a, b, c, d, e, f, g, 0xa831c66d2db43210);
+ step(26, g, h, a, b, c, d, e, f, 0xb00327c898fb213f);
+ step(27, f, g, h, a, b, c, d, e, 0xbf597fc7beef0ee4);
+ step(28, e, f, g, h, a, b, c, d, 0xc6e00bf33da88fc2);
+ step(29, d, e, f, g, h, a, b, c, 0xd5a79147930aa725);
+ step(30, c, d, e, f, g, h, a, b, 0x06ca6351e003826f);
+ step(31, b, c, d, e, f, g, h, a, 0x142929670a0e6e70);
+ step(32, a, b, c, d, e, f, g, h, 0x27b70a8546d22ffc);
+ step(33, h, a, b, c, d, e, f, g, 0x2e1b21385c26c926);
+ step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc5ac42aed);
+ step(35, f, g, h, a, b, c, d, e, 0x53380d139d95b3df);
+ step(36, e, f, g, h, a, b, c, d, 0x650a73548baf63de);
+ step(37, d, e, f, g, h, a, b, c, 0x766a0abb3c77b2a8);
+ step(38, c, d, e, f, g, h, a, b, 0x81c2c92e47edaee6);
+ step(39, b, c, d, e, f, g, h, a, 0x92722c851482353b);
+ step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a14cf10364);
+ step(41, h, a, b, c, d, e, f, g, 0xa81a664bbc423001);
+ step(42, g, h, a, b, c, d, e, f, 0xc24b8b70d0f89791);
+ step(43, f, g, h, a, b, c, d, e, 0xc76c51a30654be30);
+ step(44, e, f, g, h, a, b, c, d, 0xd192e819d6ef5218);
+ step(45, d, e, f, g, h, a, b, c, 0xd69906245565a910);
+ step(46, c, d, e, f, g, h, a, b, 0xf40e35855771202a);
+ step(47, b, c, d, e, f, g, h, a, 0x106aa07032bbd1b8);
+ step(48, a, b, c, d, e, f, g, h, 0x19a4c116b8d2d0c8);
+ step(49, h, a, b, c, d, e, f, g, 0x1e376c085141ab53);
+ step(50, g, h, a, b, c, d, e, f, 0x2748774cdf8eeb99);
+ step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5e19b48a8);
+ step(52, e, f, g, h, a, b, c, d, 0x391c0cb3c5c95a63);
+ step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4ae3418acb);
+ step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f7763e373);
+ step(55, b, c, d, e, f, g, h, a, 0x682e6ff3d6b2b8a3);
+ step(56, a, b, c, d, e, f, g, h, 0x748f82ee5defb2fc);
+ step(57, h, a, b, c, d, e, f, g, 0x78a5636f43172f60);
+ step(58, g, h, a, b, c, d, e, f, 0x84c87814a1f0ab72);
+ step(59, f, g, h, a, b, c, d, e, 0x8cc702081a6439ec);
+ step(60, e, f, g, h, a, b, c, d, 0x90befffa23631e28);
+ step(61, d, e, f, g, h, a, b, c, 0xa4506cebde82bde9);
+ step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7b2c67915);
+ step(63, b, c, d, e, f, g, h, a, 0xc67178f2e372532b); // step 63
+ step(64, a, b, c, d, e, f, g, h, 0xca273eceea26619c);
+ step(65, h, a, b, c, d, e, f, g, 0xd186b8c721c0c207);
+ step(66, g, h, a, b, c, d, e, f, 0xeada7dd6cde0eb1e);
+ step(67, f, g, h, a, b, c, d, e, 0xf57d4f7fee6ed178);
+ step(68, e, f, g, h, a, b, c, d, 0x06f067aa72176fba);
+ step(69, d, e, f, g, h, a, b, c, 0x0a637dc5a2c898a6);
+ step(70, c, d, e, f, g, h, a, b, 0x113f9804bef90dae);
+ step(71, b, c, d, e, f, g, h, a, 0x1b710b35131c471b);
+ step(72, a, b, c, d, e, f, g, h, 0x28db77f523047d84);
+ step(73, h, a, b, c, d, e, f, g, 0x32caab7b40c72493);
+ step(74, g, h, a, b, c, d, e, f, 0x3c9ebe0a15c9bebc);
+ step(75, f, g, h, a, b, c, d, e, 0x431d67c49c100d4c);
+ step(76, e, f, g, h, a, b, c, d, 0x4cc5d4becb3e42b6);
+ step(77, d, e, f, g, h, a, b, c, 0x597f299cfc657e2a);
+ step(78, c, d, e, f, g, h, a, b, 0x5fcb6fab3ad6faec);
+ step(79, b, c, d, e, f, g, h, a, 0x6c44198c4a475817); // step 79
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+ digest[5] += f;
+ digest[6] += g;
+ digest[7] += h;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_base_slver_000002f3;
+struct slver sha512_ctx_mgr_init_base_slver = { 0x02f3, 0x00, 0x00 };
+
+struct slver sha512_ctx_mgr_submit_base_slver_000002f4;
+struct slver sha512_ctx_mgr_submit_base_slver = { 0x02f4, 0x00, 0x00 };
+
+struct slver sha512_ctx_mgr_flush_base_slver_000002f5;
+struct slver sha512_ctx_mgr_flush_base_slver = { 0x02f5, 0x00, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base_aliases.c
new file mode 100644
index 000000000..9890c2c47
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base_aliases.c
@@ -0,0 +1,54 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdint.h>
+#include <string.h>
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+
+extern void sha512_ctx_mgr_init_base(SHA512_HASH_CTX_MGR * mgr);
+extern SHA512_HASH_CTX *sha512_ctx_mgr_submit_base(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx, const void *buffer,
+ uint32_t len, HASH_CTX_FLAG flags);
+extern SHA512_HASH_CTX *sha512_ctx_mgr_flush_base(SHA512_HASH_CTX_MGR * mgr);
+
+void sha512_ctx_mgr_init(SHA512_HASH_CTX_MGR * mgr)
+{
+ return sha512_ctx_mgr_init_base(mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ return sha512_ctx_mgr_submit_base(mgr, ctx, buffer, len, flags);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush(SHA512_HASH_CTX_MGR * mgr)
+{
+ return sha512_ctx_mgr_flush_base(mgr);
+}
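+
+/*
+ * Example usage (illustrative sketch only): initialize the manager once, submit one
+ * or more contexts, then flush until no jobs remain.  hash_ctx_init() is assumed to
+ * be the context-initialization helper from multi_buffer.h.
+ *
+ *     SHA512_HASH_CTX_MGR mgr;
+ *     SHA512_HASH_CTX ctx;
+ *
+ *     sha512_ctx_mgr_init(&mgr);
+ *     hash_ctx_init(&ctx);
+ *     sha512_ctx_mgr_submit(&mgr, &ctx, buf, len, HASH_ENTIRE);
+ *     while (sha512_ctx_mgr_flush(&mgr) != NULL)
+ *             ;
+ *     // The digest is now in ctx.job.result_digest[0..7].
+ */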
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c
new file mode 100644
index 000000000..94c32d260
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c
@@ -0,0 +1,255 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx);
+
+void sha512_ctx_mgr_init_sb_sse4(SHA512_HASH_CTX_MGR * mgr)
+{
+ sha512_sb_mgr_init_sse4(&mgr->mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_sb_sse4(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx, const void *buffer,
+ uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_submit_sse4(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_sb_sse4(SHA512_HASH_CTX_MGR * mgr)
+{
+ SHA512_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_flush_sse4(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA512_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA512_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_submit_sse4(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_submit_sse4(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA512_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_sb_sse4_slver_05020172;
+struct slver sha512_ctx_mgr_init_sb_sse4_slver = { 0x0172, 0x02, 0x05 };
+
+struct slver sha512_ctx_mgr_submit_sb_sse4_slver_05020173;
+struct slver sha512_ctx_mgr_submit_sb_sse4_slver = { 0x0173, 0x02, 0x05 };
+
+struct slver sha512_ctx_mgr_flush_sb_sse4_slver_05020174;
+struct slver sha512_ctx_mgr_flush_sb_sse4_slver = { 0x0174, 0x02, 0x05 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c
new file mode 100644
index 000000000..b73619875
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c
@@ -0,0 +1,255 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx);
+
+void sha512_ctx_mgr_init_sse(SHA512_HASH_CTX_MGR * mgr)
+{
+ sha512_mb_mgr_init_sse(&mgr->mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_sse(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx =
+ (SHA512_HASH_CTX *) sha512_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_sse(SHA512_HASH_CTX_MGR * mgr)
+{
+ SHA512_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_sse(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA512_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA512_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA512_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_sse_slver_00020163;
+struct slver sha512_ctx_mgr_init_sse_slver = { 0x0163, 0x02, 0x00 };
+
+struct slver sha512_ctx_mgr_submit_sse_slver_00020164;
+struct slver sha512_ctx_mgr_submit_sse_slver = { 0x0164, 0x02, 0x00 };
+
+struct slver sha512_ctx_mgr_flush_sse_slver_00020165;
+struct slver sha512_ctx_mgr_flush_sse_slver = { 0x0165, 0x02, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm
new file mode 100644
index 000000000..4423cdcb5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm
@@ -0,0 +1,54 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define STS_UNKNOWN 0
+%define STS_BEING_PROCESSED 1
+%define STS_COMPLETED 2
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA512_JOB structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; JOB_SHA512
+
+;;; name size align
+FIELD _buffer, 8, 8 ; pointer to buffer
+FIELD _len, 8, 8 ; length in bytes
+FIELD _result_digest, 8*8, 64 ; Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+
+%assign _SHA512_JOB_size _FIELD_OFFSET
+%assign _SHA512_JOB_align _STRUCT_ALIGN
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm
new file mode 100644
index 000000000..f54135da3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm
@@ -0,0 +1,72 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA512 Out Of Order Data Structures
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; LANE_DATA
+;;; name size align
+FIELD _job_in_lane, 8, 8 ; pointer to job object
+END_FIELDS
+
+%assign _LANE_DATA_size _FIELD_OFFSET
+%assign _LANE_DATA_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SHA512_ARGS_X8
+;;; name size align
+FIELD _digest, 8*8*8, 4 ; transposed digest
+FIELD _data_ptr, 8*8, 8 ; array of pointers to data
+END_FIELDS
+
+%assign _SHA512_ARGS_X4_size _FIELD_OFFSET
+%assign _SHA512_ARGS_X4_align _STRUCT_ALIGN
+%assign _SHA512_ARGS_X8_size _FIELD_OFFSET
+%assign _SHA512_ARGS_X8_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; MB_MGR
+;;; name size align
+FIELD _args, _SHA512_ARGS_X4_size, _SHA512_ARGS_X4_align
+FIELD _lens, 8*8, 8
+FIELD _unused_lanes, 8, 8
+FIELD _ldata, _LANE_DATA_size*8, _LANE_DATA_align
+FIELD _num_lanes_inuse, 4, 4
+END_FIELDS
+
+%assign _MB_MGR_size _FIELD_OFFSET
+%assign _MB_MGR_align _STRUCT_ALIGN
+
+_args_digest equ _args + _digest
+_args_data_ptr equ _args + _data_ptr
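+
+; _args_digest and _args_data_ptr are the byte offsets of the transposed digest array
+; and the per-lane data pointers within the MB_MGR structure (i.e. inside its embedded
+; SHA512_ARGS member).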
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm
new file mode 100644
index 000000000..65ce43d3a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm
@@ -0,0 +1,224 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x2_avx
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*3
+_ALIGN_SIZE equ 0
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA512_JOB* sha512_mb_mgr_flush_avx(SHA512_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha512_mb_mgr_flush_avx, function
+sha512_mb_mgr_flush_avx:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rsi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 16+7
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 2
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
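+	; Each _lens entry packs the lane index in its low bits and the block count in its
+	; upper 32 bits, so an unsigned minimum over the packed values selects the lane with
+	; the fewest remaining blocks and carries its index along.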
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x2_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*32]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*32]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*32]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*32]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov r12, [rsp + _GPR_SAVE + 8*1]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm
new file mode 100644
index 000000000..33a24a6b9
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm
@@ -0,0 +1,245 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x4_avx2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*5
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.xmm
+%define _GPR_SAVE stack_frame.gpr
+%define STACK_SPACE stack_frame_size
+
+%define APPEND(a,b) a %+ b
+
+; SHA512_JOB* sha512_mb_mgr_flush_avx2(SHA512_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha512_mb_mgr_flush_avx2, function
+sha512_mb_mgr_flush_avx2:
+ endbranch
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+ and rsp, ~31
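+	; rsp is aligned down to a 32-byte boundary; the original rsp (saved in rax above)
+	; is stored in the frame so it can be restored on return.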
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rsi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 32+7
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov lens2, [state + _lens + 2*8]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov lens3, [state + _lens + 3*8]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+ mov [state + _lens + 2*8], lens2
+ mov [state + _lens + 3*8], lens3
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x4_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*32]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*32]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*32]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*32]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov r12, [rsp + _GPR_SAVE + 8*1]
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm
new file mode 100644
index 000000000..795027c6b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm
@@ -0,0 +1,270 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+extern sha512_mb_x8_avx512
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define num_lanes_inuse r9
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define num_lanes_inuse r9
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*8
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.xmm
+%define _GPR_SAVE stack_frame.gpr
+%define STACK_SPACE stack_frame_size
+
+%define APPEND(a,b) a %+ b
+
+; SHA512_JOB* sha512_mb_mgr_flush_avx512(SHA512_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha512_mb_mgr_flush_avx512, function
+sha512_mb_mgr_flush_avx512:
+ endbranch
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqu [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqu [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqu [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqu [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqu [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqu [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqu [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqu [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqu [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqu [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 7
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length; each len entry in sha512_mgr is 64-bit: the high 32 bits hold the block count, the low 8 bits hold the lane idx
+ vmovdqu ymm0, [state + _lens + 0*32] ; ymm0 has {D,d,C,c,B,b,A,a}
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminuq ymm2, ymm0, ymm1 ; ymm2 has {D,i,C,i,B,i,A,i}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,i,D,i,x,i,B,i}
+ vpminuq ymm2, ymm2, ymm3 ; ymm2 has {x,i,F,i,x,i,E,i}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,i,x,i,x,i,F,i}
+ vpminuq ymm2, ymm2, ymm3 ; ymm2 has min value in high dword
+
+ vmovq idx, xmm2
+ mov len2, idx
+ and idx, 0xF
+	shr len2, 32 ; SHA512 block size is 1024 bits
+ jz len_is_0
+
+ vperm2i128 ymm2, ymm2, ymm2, 0 ; ymm2 has {x,x,E,i,x,x,E,i}
+ vpand ymm2, ymm2, [rel clear_low_nibble] ; ymm2 has {0,0,E,0,0,0,E,0}
+ vpshufd ymm2, ymm2, 0x44 ; ymm2 has {E,0,E,0,E,0,E,0}
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x8_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*64]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*64], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*64]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*64], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*64]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*64], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*64]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*64], 1
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqu xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqu xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqu xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqu xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqu xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqu xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqu xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqu xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqu xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=32
+
+align 32
+clear_low_nibble: ; mgr len element is 0xnnnnnnnn_0000000m: nnnnnnnn is the block count, m is the lane index
+ dq 0xFFFFFFFF00000000, 0x0000000000000000
+ dq 0xFFFFFFFF00000000, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha512_mb_mgr_flush_avx512
+no_sha512_mb_mgr_flush_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
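
For orientation, the scheduling trick used by the flush path above can be modelled in plain C: each 64-bit lens entry packs the remaining block count into its high 32 bits and the lane index into its low bits, so one unsigned minimum over all lanes (what the vpminuq reduction computes) identifies both the shortest outstanding job and the lane holding it. The sketch below is illustrative only and is not part of the ISA-L sources; NUM_LANES and the helper names are assumptions made for the example.

#include <stdint.h>
#include <stdio.h>

#define NUM_LANES 8     /* the AVX-512 manager runs 8 SHA-512 lanes */

/* Pack a lane's state the way the manager's lens[] array does:
 * high 32 bits = blocks still to process, low bits = lane index. */
static uint64_t pack_len(uint32_t blocks, uint32_t lane)
{
	return ((uint64_t)blocks << 32) | lane;
}

/* Scalar equivalent of the vpminuq reduction in the flush path. */
static uint64_t min_len(const uint64_t lens[NUM_LANES])
{
	uint64_t m = lens[0];
	for (int i = 1; i < NUM_LANES; i++)
		if (lens[i] < m)
			m = lens[i];
	return m;
}

int main(void)
{
	uint64_t lens[NUM_LANES];

	for (uint32_t i = 0; i < NUM_LANES; i++)
		lens[i] = pack_len(3 + 2 * i, i);       /* arbitrary block counts */
	lens[5] = pack_len(1, 5);                       /* lane 5 holds the shortest job */

	uint64_t m = min_len(lens);
	printf("shortest job: lane %u, %u blocks\n",
	       (unsigned)(m & 0xF), (unsigned)(m >> 32));
	return 0;
}
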
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm
new file mode 100644
index 000000000..8a58bf879
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm
@@ -0,0 +1,227 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x2_sse
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*3
+_ALIGN_SIZE equ 0
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA512_JOB* sha512_mb_mgr_flush_sse(SHA512_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha512_mb_mgr_flush_sse, function
+sha512_mb_mgr_flush_sse:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rsi
+ movdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 16+7
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 2
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x2_sse
+ ; state and idx are intact
+
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movq xmm0, [state + _args_digest + 8*idx + 0*32]
+ pinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ movq xmm1, [state + _args_digest + 8*idx + 2*32]
+ pinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ movq xmm2, [state + _args_digest + 8*idx + 4*32]
+ pinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ movq xmm3, [state + _args_digest + 8*idx + 6*32]
+ pinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ movdqa [job_rax + _result_digest + 1*16], xmm1
+ movdqa [job_rax + _result_digest + 2*16], xmm2
+ movdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov r12, [rsp + _GPR_SAVE + 8*1]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c
new file mode 100644
index 000000000..7ca997653
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c
@@ -0,0 +1,45 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+
+void sha512_mb_mgr_init_avx2(SHA512_MB_JOB_MGR * state)
+{
+ unsigned int j;
+
+ state->lens[0] = 0;
+ state->lens[1] = 1;
+ state->lens[2] = 2;
+ state->lens[3] = 3;
+ state->unused_lanes = 0xFF03020100;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA512_X4_LANES; j++) {
+ state->ldata[j].job_in_lane = 0;
+ }
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c
new file mode 100644
index 000000000..bca9549d9
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c
@@ -0,0 +1,42 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+
+void sha512_mb_mgr_init_avx512(SHA512_MB_JOB_MGR * state)
+{
+ unsigned int j;
+
+ state->unused_lanes = 0x0706050403020100;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA512_MAX_LANES; j++) {
+		state->lens[j] = j;	// sha512_mb dedicates the low 32 bits of each lens entry to the lane idx
+ state->ldata[j].job_in_lane = 0;
+ }
+}
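
The 0x0706050403020100 initializer above is a byte-packed stack of free lane numbers: the submit path pops the lowest byte (movzx + shr 8) to claim a lane, and the completion path pushes it back (shl 8 + or). Below is a minimal, hypothetical C sketch of that discipline; it is not part of the ISA-L sources and the helper names are made up for the example.

#include <stdint.h>
#include <stdio.h>

/* Pop the next free lane off the byte-packed stack (submit path). */
static uint32_t pop_lane(uint64_t *unused_lanes)
{
	uint32_t lane = (uint32_t)(*unused_lanes & 0xFF);
	*unused_lanes >>= 8;
	return lane;
}

/* Push a finished lane back onto the stack (completion path). */
static void push_lane(uint64_t *unused_lanes, uint32_t lane)
{
	*unused_lanes = (*unused_lanes << 8) | lane;
}

int main(void)
{
	uint64_t unused_lanes = 0x0706050403020100ULL;  /* all 8 lanes free */

	uint32_t lane = pop_lane(&unused_lanes);        /* claims lane 0 */
	printf("claimed lane %u, stack now 0x%016llx\n",
	       lane, (unsigned long long)unused_lanes);

	push_lane(&unused_lanes, lane);                 /* returns it */
	printf("stack restored to 0x%016llx\n",
	       (unsigned long long)unused_lanes);
	return 0;
}
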
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c
new file mode 100644
index 000000000..0e9ec257f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c
@@ -0,0 +1,43 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+
+void sha512_mb_mgr_init_sse(SHA512_MB_JOB_MGR * state)
+{
+ unsigned int j;
+
+ state->lens[0] = 0;
+ state->lens[1] = 1;
+ state->unused_lanes = 0xFF0100;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA512_MIN_LANES; j++) {
+ state->ldata[j].job_in_lane = 0;
+ }
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm
new file mode 100644
index 000000000..1e3b1b1bd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm
@@ -0,0 +1,262 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x2_avx
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*5
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.gpr
+%define _GPR_SAVE stack_frame.rsp
+%define STACK_SPACE stack_frame_size
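+; Note: despite the names, _XMM_SAVE points at the .gpr area and is used
+; below to spill the callee-saved GPRs; the XMM registers are stored
+; directly at [rsp + 16*n] inside the .xmm area at the top of the frame.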
+
+; SHA512_JOB* sha512_mb_mgr_submit_avx(SHA512_MB_JOB_MGR *state, SHA512_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha512_mb_mgr_submit_avx, function
+sha512_mb_mgr_submit_avx:
+ endbranch
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+ and rsp, ~31
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ vmovdqa [rsp + 16*0], xmm6
+ vmovdqa [rsp + 16*1], xmm7
+ vmovdqa [rsp + 16*2], xmm8
+ vmovdqa [rsp + 16*3], xmm9
+ vmovdqa [rsp + 16*4], xmm10
+ vmovdqa [rsp + 16*5], xmm11
+ vmovdqa [rsp + 16*6], xmm12
+ vmovdqa [rsp + 16*7], xmm13
+ vmovdqa [rsp + 16*8], xmm14
+ vmovdqa [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4 + 8*lane], DWORD(len)
+
+
+ ; Load digest words from result_digest
+ vmovdqa xmm0, [job + _result_digest + 0*16]
+ vmovdqa xmm1, [job + _result_digest + 1*16]
+ vmovdqa xmm2, [job + _result_digest + 2*16]
+ vmovdqa xmm3, [job + _result_digest + 3*16]
+ vmovq [state + _args_digest + 8*lane + 0*32], xmm0
+ vpextrq [state + _args_digest + 8*lane + 1*32], xmm0, 1
+ vmovq [state + _args_digest + 8*lane + 2*32], xmm1
+ vpextrq [state + _args_digest + 8*lane + 3*32], xmm1, 1
+ vmovq [state + _args_digest + 8*lane + 4*32], xmm2
+ vpextrq [state + _args_digest + 8*lane + 5*32], xmm2, 1
+ vmovq [state + _args_digest + 8*lane + 6*32], xmm3
+ vpextrq [state + _args_digest + 8*lane + 7*32], xmm3, 1
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xff
+ jne return_null
+
+start_loop:
+
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x2_avx
+ ; state and idx are intact
+
+len_is_0:
+
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*32]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*32]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*32]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*32]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 16*0]
+ vmovdqa xmm7, [rsp + 16*1]
+ vmovdqa xmm8, [rsp + 16*2]
+ vmovdqa xmm9, [rsp + 16*3]
+ vmovdqa xmm10, [rsp + 16*4]
+ vmovdqa xmm11, [rsp + 16*5]
+ vmovdqa xmm12, [rsp + 16*6]
+ vmovdqa xmm13, [rsp + 16*7]
+ vmovdqa xmm14, [rsp + 16*8]
+ vmovdqa xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+H0: dd 0x6a09e667
+H1: dd 0xbb67ae85
+H2: dd 0x3c6ef372
+H3: dd 0xa54ff53a
+H4: dd 0x510e527f
+H5: dd 0x9b05688c
+H6: dd 0x1f83d9ab
+H7: dd 0x5be0cd19
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm
new file mode 100644
index 000000000..c425c5bb9
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm
@@ -0,0 +1,270 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x4_avx2
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*5
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.gpr
+%define _GPR_SAVE stack_frame.rsp
+%define STACK_SPACE stack_frame_size
+
+; SHA512_JOB* sha512_mb_mgr_submit_avx2(SHA512_MB_JOB_MGR *state, SHA512_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha512_mb_mgr_submit_avx2, function
+sha512_mb_mgr_submit_avx2:
+ endbranch
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+ and rsp, ~31
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ vmovdqa [rsp + 16*0], xmm6
+ vmovdqa [rsp + 16*1], xmm7
+ vmovdqa [rsp + 16*2], xmm8
+ vmovdqa [rsp + 16*3], xmm9
+ vmovdqa [rsp + 16*4], xmm10
+ vmovdqa [rsp + 16*5], xmm11
+ vmovdqa [rsp + 16*6], xmm12
+ vmovdqa [rsp + 16*7], xmm13
+ vmovdqa [rsp + 16*8], xmm14
+ vmovdqa [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4 + 8*lane], DWORD(len)
+
+
+ ; Load digest words from result_digest
+ vmovdqa xmm0, [job + _result_digest + 0*16]
+ vmovdqa xmm1, [job + _result_digest + 1*16]
+ vmovdqa xmm2, [job + _result_digest + 2*16]
+ vmovdqa xmm3, [job + _result_digest + 3*16]
+ vmovq [state + _args_digest + 8*lane + 0*32], xmm0
+ vpextrq [state + _args_digest + 8*lane + 1*32], xmm0, 1
+ vmovq [state + _args_digest + 8*lane + 2*32], xmm1
+ vpextrq [state + _args_digest + 8*lane + 3*32], xmm1, 1
+ vmovq [state + _args_digest + 8*lane + 4*32], xmm2
+ vpextrq [state + _args_digest + 8*lane + 5*32], xmm2, 1
+ vmovq [state + _args_digest + 8*lane + 6*32], xmm3
+ vpextrq [state + _args_digest + 8*lane + 7*32], xmm3, 1
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xff
+ jne return_null
+
+start_loop:
+
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov lens2, [state + _lens + 2*8]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov lens3, [state + _lens + 3*8]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+ mov [state + _lens + 2*8], lens2
+ mov [state + _lens + 3*8], lens3
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x4_avx2
+ ; state and idx are intact
+
+len_is_0:
+
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*32]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*32]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*32]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*32]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 16*0]
+ vmovdqa xmm7, [rsp + 16*1]
+ vmovdqa xmm8, [rsp + 16*2]
+ vmovdqa xmm9, [rsp + 16*3]
+ vmovdqa xmm10, [rsp + 16*4]
+ vmovdqa xmm11, [rsp + 16*5]
+ vmovdqa xmm12, [rsp + 16*6]
+ vmovdqa xmm13, [rsp + 16*7]
+ vmovdqa xmm14, [rsp + 16*8]
+ vmovdqa xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+H0: dd 0x6a09e667
+H1: dd 0xbb67ae85
+H2: dd 0x3c6ef372
+H3: dd 0xa54ff53a
+H4: dd 0x510e527f
+H5: dd 0x9b05688c
+H6: dd 0x1f83d9ab
+H7: dd 0x5be0cd19
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm
new file mode 100644
index 000000000..23b1b5c27
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm
@@ -0,0 +1,280 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+extern sha512_mb_x8_avx512
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define num_lanes_inuse r9
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*8
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.gpr
+%define _GPR_SAVE stack_frame.rsp
+%define STACK_SPACE stack_frame_size
+
+; SHA512_JOB* sha512_mb_mgr_submit_avx512(SHA512_MB_JOB_MGR *state, SHA512_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha512_mb_mgr_submit_avx512, function
+sha512_mb_mgr_submit_avx512:
+ endbranch
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+ mov [rsp + _XMM_SAVE + 8*5], r13
+ mov [rsp + _XMM_SAVE + 8*6], r14
+ mov [rsp + _XMM_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ vmovdqu [rsp + 16*0], xmm6
+ vmovdqu [rsp + 16*1], xmm7
+ vmovdqu [rsp + 16*2], xmm8
+ vmovdqu [rsp + 16*3], xmm9
+ vmovdqu [rsp + 16*4], xmm10
+ vmovdqu [rsp + 16*5], xmm11
+ vmovdqu [rsp + 16*6], xmm12
+ vmovdqu [rsp + 16*7], xmm13
+ vmovdqu [rsp + 16*8], xmm14
+ vmovdqu [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4 + 8*lane], DWORD(len)
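+	; the +4 byte offset lands DWORD(len) in the high dword of the 64-bit
+	; lens entry, so the low dword keeps the lane idx written at init time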
+
+
+ ; Load digest words from result_digest
+ vmovdqa xmm0, [job + _result_digest + 0*16]
+ vmovdqa xmm1, [job + _result_digest + 1*16]
+ vmovdqa xmm2, [job + _result_digest + 2*16]
+ vmovdqa xmm3, [job + _result_digest + 3*16]
+ vmovq [state + _args_digest + 8*lane + 0*64], xmm0
+ vpextrq [state + _args_digest + 8*lane + 1*64], xmm0, 1
+ vmovq [state + _args_digest + 8*lane + 2*64], xmm1
+ vpextrq [state + _args_digest + 8*lane + 3*64], xmm1, 1
+ vmovq [state + _args_digest + 8*lane + 4*64], xmm2
+ vpextrq [state + _args_digest + 8*lane + 5*64], xmm2, 1
+ vmovq [state + _args_digest + 8*lane + 6*64], xmm3
+ vpextrq [state + _args_digest + 8*lane + 7*64], xmm3, 1
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ add num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ cmp num_lanes_inuse, 8
+ jne return_null
+
+start_loop:
+	; Find min length; each len entry in sha512_mgr is 64-bit: the high 32 bits hold the block count, the low 8 bits hold the lane idx
+ vmovdqu ymm0, [state + _lens + 0*32] ; ymm0 has {D,d,C,c,B,b,A,a}
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminuq ymm2, ymm0, ymm1 ; ymm2 has {D,i,C,i,B,i,A,i}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,i,D,i,x,i,B,i}
+ vpminuq ymm2, ymm2, ymm3 ; ymm2 has {x,i,F,i,x,i,E,i}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,i,x,i,x,i,F,i}
+ vpminuq ymm2, ymm2, ymm3 ; ymm2 has min value in high dword
+
+ vmovq idx, xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 32
+ jz len_is_0
+
+
+ vperm2i128 ymm2, ymm2, ymm2, 0 ; ymm2 has {x,x,E,i,x,x,E,i}
+ vpand ymm2, ymm2, [rel clear_low_nibble] ; ymm2 has {0,0,E,0,0,0,E,0}
+ vpshufd ymm2, ymm2, 0x44 ; ymm2 has {E,0,E,0,E,0,E,0}
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x8_avx512
+ ; state and idx are intact
+
+len_is_0:
+
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*64]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*64], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*64]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*64], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*64]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*64], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*64]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*64], 1
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + 16*0]
+ vmovdqu xmm7, [rsp + 16*1]
+ vmovdqu xmm8, [rsp + 16*2]
+ vmovdqu xmm9, [rsp + 16*3]
+ vmovdqu xmm10, [rsp + 16*4]
+ vmovdqu xmm11, [rsp + 16*5]
+ vmovdqu xmm12, [rsp + 16*6]
+ vmovdqu xmm13, [rsp + 16*7]
+ vmovdqu xmm14, [rsp + 16*8]
+ vmovdqu xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ mov r13, [rsp + _XMM_SAVE + 8*5]
+ mov r14, [rsp + _XMM_SAVE + 8*6]
+ mov r15, [rsp + _XMM_SAVE + 8*7]
+
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=32
+
+align 32
+clear_low_nibble: ; mgr len element is 0xnnnnnnnn_0000000m: nnnnnnnn is the block count, m is the lane index
+ dq 0xFFFFFFFF00000000, 0x0000000000000000
+ dq 0xFFFFFFFF00000000, 0x0000000000000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha512_mb_mgr_submit_avx512
+no_sha512_mb_mgr_submit_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm
new file mode 100644
index 000000000..ba12d586b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm
@@ -0,0 +1,260 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x2_sse
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*5
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.gpr
+%define _GPR_SAVE stack_frame.rsp
+%define STACK_SPACE stack_frame_size
+
+; SHA512_JOB* sha512_mb_mgr_submit_sse(SHA512_MB_JOB_MGR *state, SHA512_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha512_mb_mgr_submit_sse, function
+sha512_mb_mgr_submit_sse:
+ endbranch
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+ and rsp, ~31
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ movdqa [rsp + 16*0], xmm6
+ movdqa [rsp + 16*1], xmm7
+ movdqa [rsp + 16*2], xmm8
+ movdqa [rsp + 16*3], xmm9
+ movdqa [rsp + 16*4], xmm10
+ movdqa [rsp + 16*5], xmm11
+ movdqa [rsp + 16*6], xmm12
+ movdqa [rsp + 16*7], xmm13
+ movdqa [rsp + 16*8], xmm14
+ movdqa [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4 + 8*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ movdqa xmm0, [job + _result_digest + 0*16]
+ movdqa xmm1, [job + _result_digest + 1*16]
+ movdqa xmm2, [job + _result_digest + 2*16]
+ movdqa xmm3, [job + _result_digest + 3*16]
+ movq [state + _args_digest + 8*lane + 0*32], xmm0
+ pextrq [state + _args_digest + 8*lane + 1*32], xmm0, 1
+ movq [state + _args_digest + 8*lane + 2*32], xmm1
+ pextrq [state + _args_digest + 8*lane + 3*32], xmm1, 1
+ movq [state + _args_digest + 8*lane + 4*32], xmm2
+ pextrq [state + _args_digest + 8*lane + 5*32], xmm2, 1
+ movq [state + _args_digest + 8*lane + 6*32], xmm3
+ pextrq [state + _args_digest + 8*lane + 7*32], xmm3, 1
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xff
+ jne return_null
+
+start_loop:
+
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x2_sse
+ ; state and idx are intact
+
+len_is_0:
+
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movq xmm0, [state + _args_digest + 8*idx + 0*32]
+ pinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ movq xmm1, [state + _args_digest + 8*idx + 2*32]
+ pinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ movq xmm2, [state + _args_digest + 8*idx + 4*32]
+ pinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ movq xmm3, [state + _args_digest + 8*idx + 6*32]
+ pinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ movdqa [job_rax + _result_digest + 1*16], xmm1
+ movdqa [job_rax + _result_digest + 2*16], xmm2
+ movdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + 16*0]
+ movdqa xmm7, [rsp + 16*1]
+ movdqa xmm8, [rsp + 16*2]
+ movdqa xmm9, [rsp + 16*3]
+ movdqa xmm10, [rsp + 16*4]
+ movdqa xmm11, [rsp + 16*5]
+ movdqa xmm12, [rsp + 16*6]
+ movdqa xmm13, [rsp + 16*7]
+ movdqa xmm14, [rsp + 16*8]
+ movdqa xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+H0: dd 0x6a09e667
+H1: dd 0xbb67ae85
+H2: dd 0x3c6ef372
+H3: dd 0xa54ff53a
+H4: dd 0x510e527f
+H5: dd 0x9b05688c
+H6: dd 0x1f83d9ab
+H7: dd 0x5be0cd19
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c
new file mode 100644
index 000000000..74fa0384a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c
@@ -0,0 +1,160 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha512_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 200
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][8 * SHA512_DIGEST_NWORDS];
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA512_HASH_CTX_MGR *mgr = NULL;
+ SHA512_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ int ret;
+
+ printf("multibinary_sha512 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed, test aborted\n");
+ return 1;
+ }
+
+ sha512_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+			printf("malloc failed, test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // SSL test
+ SHA512(bufs[i], TEST_LEN, digest_ssl[i]);
+
+ // sb_sha512 test
+ sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sha512_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be64(((uint64_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %016lX <=> %016lX\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be64(((uint64_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha512_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Random buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run SSL test
+ SHA512(bufs[i], lens[i], digest_ssl[i]);
+
+ // Run sb_sha512 test
+ sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha512_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be64(((uint64_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %016lX <=> %016lX\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be64(((uint64_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha512_ssl rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c
new file mode 100644
index 000000000..f71d06df8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c
@@ -0,0 +1,203 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha512_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint64_t digest_ref[TEST_BUFS][SHA512_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha512_ref(uint8_t * input_data, uint64_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA512_HASH_CTX_MGR *mgr = NULL;
+ SHA512_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ uint8_t *tmp_buf;
+ int ret;
+
+ printf("multibinary_sha512 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha512_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha512_ref(bufs[i], digest_ref[i], TEST_LEN);
+
+ // Run sb_sha512 test
+ sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sha512_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%016lX <=> 0x%016lX \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha512_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Use buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run reference test
+ sha512_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // Run sha512_mb test
+ sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha512_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail "
+ "0x%016lX <=> 0x%016lX\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ // Test at the end of buffer
+ jobs = rand() % TEST_BUFS;
+ tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs);
+ if (!tmp_buf) {
+ printf("malloc failed, end test aborted.\n");
+ return 1;
+ }
+
+ rand_buffer(tmp_buf, jobs);
+
+ sha512_ctx_mgr_init(mgr);
+
+ // Extend to the end of allocated buffer to construct jobs
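+	// (job i covers the last jobs - i bytes of tmp_buf, so every job ends
+	// exactly at the end of the allocation)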
+ for (i = 0; i < jobs; i++) {
+ bufs[i] = (uint8_t *) & tmp_buf[i];
+ lens[i] = jobs - i;
+
+ // Reference test
+ sha512_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // sb_sha512 test
+ sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha512_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("End test failed at offset %d - result: 0x%016lX"
+ ", ref: 0x%016lX\n", i, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ putchar('.');
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha512 rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c
new file mode 100644
index 000000000..383c45cd2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c
@@ -0,0 +1,300 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha512_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define UPDATE_SIZE 13*SHA512_BLOCK_SIZE
+#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*SHA512_BLOCK_SIZE))
+
+#ifdef DEBUG
+# define debug_char(x) putchar(x)
+#else
+# define debug_char(x) do {} while (0)
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint64_t digest_ref[TEST_BUFS][SHA512_DIGEST_NWORDS];
+
+extern void sha512_ref(uint8_t * input_data, uint64_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA512_HASH_CTX_MGR *mgr = NULL;
+ SHA512_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, fail = 0;
+ int len_done, len_rem, len_rand;
+ unsigned char *bufs[TEST_BUFS];
+ unsigned char *buf_ptr[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int joblen, jobs, t;
+ int ret;
+
+ printf("multibinary_sha512_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha512_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ buf_ptr[i] = bufs[i];
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha512_ref(bufs[i], digest_ref[i], TEST_LEN);
+ }
+
+ // Run sb_sha512 tests
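+	// Each buffer is fed in UPDATE_SIZE chunks: when submit hands back an
+	// unfinished context, advance that job's pointer and resubmit it.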
+ for (i = 0; i < TEST_BUFS;) {
+ len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_done == 0)
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_FIRST);
+ else if (len_rem <= UPDATE_SIZE)
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+		// Move to the next job if this one was absorbed (NULL) or came back finished
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha512_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha512_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+
+ len_done = (int)((unsigned long)buf_ptr[i]
+ - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_rem <= UPDATE_SIZE)
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha512_ctx_mgr_flush(mgr);
+ }
+
+ // Check digests
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+				printf("Test%d fixed size, digest%d fail %8lX <=> %8lX\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ for (i = 0; i < jobs; i++) {
+ joblen = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], joblen);
+ lens[i] = joblen;
+ buf_ptr[i] = bufs[i];
+ sha512_ref(bufs[i], digest_ref[i], lens[i]);
+ }
+
+ sha512_ctx_mgr_init(mgr);
+
+ // Run sha512_sb jobs
+ i = 0;
+ while (i < jobs) {
+ // Submit a new job
+ len_rand = SHA512_BLOCK_SIZE +
+ SHA512_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS);
+
+ if (lens[i] > len_rand)
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_FIRST);
+ else
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], lens[i], HASH_ENTIRE);
+
+ // Returned ctx could be:
+ // - null context (we are just getting started and lanes aren't full yet), or
+ // - finished already (an ENTIRE we submitted or a previous LAST is returned), or
+ // - an unfinished ctx, we will resubmit
+
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ } else {
+ // unfinished ctx returned, choose another random update length and submit either
+ // UPDATE or LAST depending on the amount of buffer remaining
+ while ((ctx != NULL) && !(hash_ctx_complete(ctx))) {
+ j = (unsigned long)(ctx->user_data); // Get index of the returned ctx
+ buf_ptr[j] = bufs[j] + ctx->total_length;
+ len_rand = (rand() % SHA512_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ len_rem = lens[j] - ctx->total_length;
+
+ if (len_rem <= len_rand) // submit the rest of the job as LAST
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rem,
+ HASH_LAST);
+ else // submit the random update length as UPDATE
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rand,
+ HASH_UPDATE);
+ } // Either continue submitting any contexts returned here as UPDATE/LAST, or
+ // go back to submitting new jobs using the index i.
+
+ i++;
+ }
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha512_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha512_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer
+ len_rem = lens[i] - ctx->total_length;
+ len_rand = (rand() % SHA512_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ debug_char('+');
+ if (len_rem <= len_rand)
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha512_ctx_mgr_flush(mgr);
+ }
+
+ // Check result digest
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail %8lX <=> %8lX\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha512_update rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c
new file mode 100644
index 000000000..a84e7af3e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c
@@ -0,0 +1,270 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sha512_mb.h"
+
+typedef uint64_t DigestSHA512[SHA512_DIGEST_NWORDS];
+
+#define MSGS 8
+#define NUM_JOBS 1000
+
+#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS
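+// For example: PSEUDO_RANDOM_NUM(10) = (10*5 + (10*10)/64) % 8 = 51 % 8 = 3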
+
+static uint8_t msg1[] = "The quick brown fox jumps over the lazy dog";
+static uint8_t msg2[] = "The quick brown fox jumps over the lazy dog.";
+static uint8_t msg3[] = { 0x0a, 0x55, 0xdb, 0 };
+static uint8_t msg4[] = { 0xba, 0xd7, 0xc6, 0x18, 0xf4, 0x5b, 0xe2, 0x07, 0x97, 0x5e, 0 };
+
+static uint8_t msg5[] = {
+ 0xb1, 0x71, 0x5f, 0x78, 0x2f, 0xf0, 0x2c, 0x6b, 0x88, 0x93,
+ 0x7f, 0x05, 0x41, 0x16, 0
+};
+
+static uint8_t msg6[] = {
+ 0xc6, 0xa1, 0x70, 0x93, 0x65, 0x68, 0x65, 0x10, 0x20, 0xed,
+ 0xfe, 0x15, 0xdf, 0x80, 0x12, 0xac, 0xda, 0x8d, 0
+};
+
+static uint8_t msg7[] = {
+ 0xa8, 0xa3, 0x7d, 0xfc, 0x08, 0x3a, 0xd2, 0xf4, 0x7f, 0xff,
+ 0x46, 0x87, 0x38, 0xbf, 0x8b, 0x72, 0x8e, 0xb7, 0xf1, 0x90,
+ 0x7e, 0x42, 0x7f, 0xa1, 0x5c, 0xb4, 0x42, 0x4b, 0xc6, 0x85,
+ 0xe5, 0x5e, 0xd7, 0xb2, 0x82, 0x5c, 0x9c, 0x60, 0xb8, 0x39,
+ 0xcc, 0xc2, 0xfe, 0x5f, 0xb3, 0x3e, 0x36, 0xf5, 0x70, 0xcb,
+ 0x86, 0x61, 0x60, 0x9e, 0x63, 0x0b, 0xda, 0x05, 0xee, 0x64,
+ 0x1d, 0x93, 0x84, 0x28, 0x86, 0x7d, 0x90, 0xe0, 0x07, 0x44,
+ 0xa4, 0xaa, 0xd4, 0x94, 0xc9, 0x3c, 0x5f, 0x6d, 0x13, 0x27,
+ 0x87, 0x80, 0x78, 0x59, 0x0c, 0xdc, 0xe1, 0xe6, 0x47, 0xc9,
+ 0x82, 0x08, 0x18, 0xf4, 0x67, 0x64, 0x1f, 0xcd, 0x50, 0x8e,
+ 0x2f, 0x2e, 0xbf, 0xd0, 0xff, 0x3d, 0x4f, 0x27, 0x23, 0x93,
+ 0x47, 0x8f, 0x3b, 0x9e, 0x6f, 0x80, 0x6b, 0x43, 0
+};
+
+static uint8_t msg8[] = "";
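+// (the binary vectors msg3..msg7 end with an explicit 0 so that strlen() below
+// returns their intended length)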
+
+static DigestSHA512 expResultDigest1 = {
+ 0x07e547d9586f6a73, 0xf73fbac0435ed769, 0x51218fb7d0c8d788, 0xa309d785436bbb64,
+ 0x2e93a252a954f239, 0x12547d1e8a3b5ed6, 0xe1bfd7097821233f, 0xa0538f3db854fee6
+};
+
+static DigestSHA512 expResultDigest2 = {
+ 0x91ea1245f20d46ae, 0x9a037a989f54f1f7, 0x90f0a47607eeb8a1, 0x4d12890cea77a1bb,
+ 0xc6c7ed9cf205e67b, 0x7f2b8fd4c7dfd3a7, 0xa8617e45f3c463d4, 0x81c7e586c39ac1ed
+};
+
+static DigestSHA512 expResultDigest3 = {
+ 0x7952585e5330cb24, 0x7d72bae696fc8a6b, 0x0f7d0804577e347d, 0x99bc1b11e52f3849,
+ 0x85a428449382306a, 0x89261ae143c2f3fb, 0x613804ab20b42dc0, 0x97e5bf4a96ef919b
+};
+
+static DigestSHA512 expResultDigest4 = {
+ 0x5886828959d1f822, 0x54068be0bd14b6a8, 0x8f59f534061fb203, 0x76a0541052dd3635,
+ 0xedf3c6f0ca3d0877, 0x5e13525df9333a21, 0x13c0b2af76515887, 0x529910b6c793c8a5
+};
+
+static DigestSHA512 expResultDigest5 = {
+ 0xee1a56ee78182ec4, 0x1d2c3ab33d4c4187, 0x1d437c5c1ca060ee, 0x9e219cb83689b4e5,
+ 0xa4174dfdab5d1d10, 0x96a31a7c8d3abda7, 0x5c1b5e6da97e1814, 0x901c505b0bc07f25
+};
+
+static DigestSHA512 expResultDigest6 = {
+ 0xc36c100cdb6c8c45, 0xb072f18256d63a66, 0xc9843acb4d07de62, 0xe0600711d4fbe64c,
+ 0x8cf314ec3457c903, 0x08147cb7ac7e4d07, 0x3ba10f0ced78ea72, 0x4a474b32dae71231
+};
+
+static DigestSHA512 expResultDigest7 = {
+ 0x8e1c91729be8eb40, 0x226f6c58a029380e, 0xf7edb9dc166a5c3c, 0xdbcefe90bd30d85c,
+ 0xb7c4b248e66abf0a, 0x3a4c842281299bef, 0x6db88858d9e5ab52, 0x44f70b7969e1c072
+};
+
+static DigestSHA512 expResultDigest8 = {
+	0xcf83e1357eefb8bd, 0xf1542850d66d8007, 0xd620e4050b5715dc, 0x83f4a921d36ce9ce,
+	0x47d0d13c5d85f2b0, 0xff8318d2877eec2f, 0x63b931bd47417a81, 0xa538327af927da3e
+};
+
+static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7, msg8 };
+
+static uint64_t *expResultDigest[MSGS] = { expResultDigest1, expResultDigest2,
+ expResultDigest3, expResultDigest4, expResultDigest5, expResultDigest6,
+ expResultDigest7, expResultDigest8
+};
+
+int main(void)
+{
+ SHA512_HASH_CTX_MGR *mgr = NULL;
+ SHA512_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+ uint32_t i, j, k, t, checked = 0;
+ uint64_t *good;
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha512_ctx_mgr_init(mgr);
+
+ // Init contexts before first use
+ for (i = 0; i < MSGS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ for (i = 0; i < MSGS; i++) {
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[i], strlen((char *)msgs[i]), HASH_ENTIRE);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %016lX, "
+ "should be %016lX\n", t, j,
+ ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+ }
+ }
+
+ while (1) {
+ ctx = sha512_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %016lX, "
+ "should be %016lX\n", t, j,
+ ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the "
+ "submit. Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ // do larger test in pseudo-random order
+
+ // Init contexts before first use
+ for (i = 0; i < NUM_JOBS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ checked = 0;
+ for (i = 0; i < NUM_JOBS; i++) {
+ j = PSEUDO_RANDOM_NUM(i);
+
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %016lX, "
+ "should be %016lX\n", t, j,
+ ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ }
+ }
+ while (1) {
+ ctx = sha512_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %016lX, "
+ "should be %016lX\n", t, j,
+ ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ if (checked != NUM_JOBS) {
+ printf("only tested %d rather than %d\n", checked, NUM_JOBS);
+ return -1;
+ }
+
+ printf(" multibinary_sha512 test: Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c
new file mode 100644
index 000000000..87d7837f6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c
@@ -0,0 +1,129 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha512_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS 32
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 1000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 10
+# define TEST_TYPE_STR "_cold"
+#endif
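+// ("warm" loops over a small cached buffer; "cold" walks a region larger than
+// the last level cache so each pass misses cache)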
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][8 * SHA512_DIGEST_NWORDS];
+
+int main(void)
+{
+ SHA512_HASH_CTX_MGR *mgr = NULL;
+ SHA512_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("calloc failed test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ sha512_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ SHA512(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sha512_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+
+ while (sha512_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sha512" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be64(((uint64_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %016lX <=> %016lX\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be64(((uint64_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+
+ printf("Multi-buffer sha512 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf("multibinary_sha512_ossl_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm
new file mode 100644
index 000000000..5d443faf7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm
@@ -0,0 +1,442 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute SHA512 by-2 using AVX
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers: rax rdx r8 r9 r10 r11
+;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15
+;;
+;; Linux clobbers: rax rsi r8 r9 r10 r11
+;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15
+;;
+;; clobbers xmm0-15
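+;;
+;; Two message lanes are hashed in lock step: each xmm register holds the same
+;; 64-bit working variable for both lanes, one qword per lane.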
+
+%define SHA512_DIGEST_WORD_SIZE 8
+%define NUM_SHA512_DIGEST_WORDS 8
+%define SHA512_DIGEST_ROW_SIZE 8*4
+%define PTR_SZ 8
+%define _data_ptr_sha512 _data_ptr
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux definitions
+%define arg1 rdi
+%define arg2 rsi
+%else
+; Windows definitions
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+%define IDX rax
+%define ROUND r8
+%define TBL r11
+
+%define inp0 r9
+%define inp1 r10
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+%define SZ2 2*SHA512_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 80*SZ2
+
+; Define stack usage
+
+struc STACK
+_DATA: resb SZ2 * 16
+_DIGEST: resb SZ2 * NUM_SHA512_DIGEST_WORDS
+ resb 8 ; for alignment, must be odd multiple of 8
+endstruc
+
+%define VMOVPD vmovupd
+
+; transpose r0, r1, t0
+; Input looks like {r0 r1}
+; r0 = {a1 a0}
+; r1 = {b1 b0}
+;
+; output looks like
+; r0 = {b0, a0}
+; t0 = {b1, a1}
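+; (vshufpd immediate: bit 0 selects the low/high qword of the first source,
+;  bit 1 of the second)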
+
+%macro TRANSPOSE 3
+%define %%r0 %1
+%define %%r1 %2
+%define %%t0 %3
+ vshufpd %%t0, %%r0, %%r1, 11b ; t0 = b1 a1
+ vshufpd %%r0, %%r0, %%r1, 00b ; r0 = b0 a0
+%endm
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORQ reg, imm, tmp
+; packed-rotate-right-double
+; does a rotate by doing two shifts and an or
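+; i.e. reg = (reg >> imm) | (reg << (64 - imm))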
+%macro PRORQ 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsllq %%tmp, %%reg, (64-(%%imm))
+ vpsrlq %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PRORQ_nd reg, imm, tmp, src
+%macro PRORQ_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsllq %%tmp, %%src, (64-(%%imm))
+ vpsrlq %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORQ dst/src, amt
+%macro PRORQ 2
+ PRORQ %1, %2, TMP
+%endmacro
+
+; PRORQ_nd dst, src, amt
+%macro PRORQ_nd 3
+ PRORQ_nd %1, %3, TMP, %2
+%endmacro
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+ PRORQ_nd a0, e, (18-14) ; sig1: a0 = (e >> 4)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORQ_nd a1, e, 41 ; sig1: a1 = (e >> 41)
+ vmovdqa [SZ2*(%%i&0xf) + rsp + _DATA],%%T1
+ vpaddq %%T1,%%T1,[TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18)
+ vpaddq h, h, a2 ; h = h + ch
+ PRORQ_nd a2, a, (34-28) ; sig0: a2 = (a >> 6)
+ vpaddq h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ vmovdqa %%T1, a ; maj: T1 = a
+ PRORQ_nd a1, a, 39 ; sig0: a1 = (a >> 39)
+ vpxor %%T1, %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ2 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddq h, h, a0
+
+ vpaddq d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddq h, h, a1 ; h = h + ch + W + K + maj
+ vpaddq h, h, a2 ; h = h + ch + W + K + maj + sigma0
+ ROTATE_ARGS
+%endm
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+ vmovdqa %%T1, [SZ2*((%%i-15)&0xf) + rsp + _DATA]
+ vmovdqa a1, [SZ2*((%%i-2)&0xf) + rsp + _DATA]
+ vmovdqa a0, %%T1
+ PRORQ %%T1, 8-1
+ vmovdqa a2, a1
+ PRORQ a1, 61-19
+ vpxor %%T1, %%T1, a0
+ PRORQ %%T1, 1
+ vpxor a1, a1, a2
+ PRORQ a1, 19
+ vpsrlq a0, a0, 7
+ vpxor %%T1, %%T1, a0
+ vpsrlq a2, a2, 6
+ vpxor a1, a1, a2
+ vpaddq %%T1, %%T1, [SZ2*((%%i-16)&0xf) + rsp + _DATA]
+ vpaddq a1, a1, [SZ2*((%%i-7)&0xf) + rsp + _DATA]
+ vpaddq %%T1, %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+%endm
+
+;; void sha512_mb_x2_avx(SHA512_MB_ARGS_X4 *args, uint64_t msg_size_in_blocks)
+;; arg 1 : STATE : pointer args (only 2 of the 4 lanes used)
+;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+;;
+mk_global sha512_mb_x2_avx, function, internal
+align 32
+sha512_mb_x2_avx:
+ endbranch
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+
+ sub rsp, STACK_size
+
+ ;; Load the pre-transposed incoming digest.
+ vmovdqa a,[STATE + 0 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa b,[STATE + 1 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa c,[STATE + 2 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa d,[STATE + 3 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa e,[STATE + 4 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa f,[STATE + 5 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa g,[STATE + 6 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa h,[STATE + 7 * SHA512_DIGEST_ROW_SIZE]
+
+ lea TBL,[K512_2_MB]
+
+ ;; load the address of each of the 2 message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _data_ptr_sha512 +0*PTR_SZ]
+ mov inp1,[STATE + _data_ptr_sha512 +1*PTR_SZ]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ ;; save old digest
+ vmovdqa [rsp + _DIGEST + 0*SZ2], a
+ vmovdqa [rsp + _DIGEST + 1*SZ2], b
+ vmovdqa [rsp + _DIGEST + 2*SZ2], c
+ vmovdqa [rsp + _DIGEST + 3*SZ2], d
+ vmovdqa [rsp + _DIGEST + 4*SZ2], e
+ vmovdqa [rsp + _DIGEST + 5*SZ2], f
+ vmovdqa [rsp + _DIGEST + 6*SZ2], g
+ vmovdqa [rsp + _DIGEST + 7*SZ2], h
+
+%assign i 0
+%rep 8
+ ;; load up the shuffler for little-endian to big-endian format
+ vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ VMOVPD TT0,[inp0+IDX+i*16] ;; double precision is 64 bits
+ VMOVPD TT2,[inp1+IDX+i*16]
+
+ TRANSPOSE TT0, TT2, TT1
+ vpshufb TT0, TT0, TMP
+ vpshufb TT1, TT1, TMP
+
+ ROUND_00_15 TT0,(i*2+0)
+ ROUND_00_15 TT1,(i*2+1)
+%assign i (i+1)
+%endrep
+
+;; Increment IDX by message block size == 8 (loop) * 16 (XMM width in bytes)
+ add IDX, 8 * 16
+
+%assign i (i*4)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ vpaddq a, a, [rsp + _DIGEST + 0*SZ2]
+ vpaddq b, b, [rsp + _DIGEST + 1*SZ2]
+ vpaddq c, c, [rsp + _DIGEST + 2*SZ2]
+ vpaddq d, d, [rsp + _DIGEST + 3*SZ2]
+ vpaddq e, e, [rsp + _DIGEST + 4*SZ2]
+ vpaddq f, f, [rsp + _DIGEST + 5*SZ2]
+ vpaddq g, g, [rsp + _DIGEST + 6*SZ2]
+ vpaddq h, h, [rsp + _DIGEST + 7*SZ2]
+
+ sub INP_SIZE, 1 ;; consumed one message block
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ vmovdqa [STATE+0*SHA512_DIGEST_ROW_SIZE],a
+ vmovdqa [STATE+1*SHA512_DIGEST_ROW_SIZE],b
+ vmovdqa [STATE+2*SHA512_DIGEST_ROW_SIZE],c
+ vmovdqa [STATE+3*SHA512_DIGEST_ROW_SIZE],d
+ vmovdqa [STATE+4*SHA512_DIGEST_ROW_SIZE],e
+ vmovdqa [STATE+5*SHA512_DIGEST_ROW_SIZE],f
+ vmovdqa [STATE+6*SHA512_DIGEST_ROW_SIZE],g
+ vmovdqa [STATE+7*SHA512_DIGEST_ROW_SIZE],h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0
+ add inp1, IDX
+ mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, STACK_size
+
+ ; outer calling routine restores XMM and other GP registers
+ ret
+
+section .data
+K512_2_MB:
+ dq 0x428a2f98d728ae22, 0x428a2f98d728ae22
+ dq 0x7137449123ef65cd, 0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
+ dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538, 0x3956c25bf348b538
+ dq 0x59f111f1b605d019, 0x59f111f1b605d019
+ dq 0x923f82a4af194f9b, 0x923f82a4af194f9b
+ dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242, 0xd807aa98a3030242
+ dq 0x12835b0145706fbe, 0x12835b0145706fbe
+ dq 0x243185be4ee4b28c, 0x243185be4ee4b28c
+ dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f, 0x72be5d74f27b896f
+ dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235, 0x9bdc06a725c71235
+ dq 0xc19bf174cf692694, 0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
+ dq 0xefbe4786384f25e3, 0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
+ dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275, 0x2de92c6f592b0275
+ dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
+ dq 0x76f988da831153b5, 0x76f988da831153b5
+ dq 0x983e5152ee66dfab, 0x983e5152ee66dfab
+ dq 0xa831c66d2db43210, 0xa831c66d2db43210
+ dq 0xb00327c898fb213f, 0xb00327c898fb213f
+ dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
+ dq 0xd5a79147930aa725, 0xd5a79147930aa725
+ dq 0x06ca6351e003826f, 0x06ca6351e003826f
+ dq 0x142929670a0e6e70, 0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc
+ dq 0x2e1b21385c26c926, 0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
+ dq 0x53380d139d95b3df, 0x53380d139d95b3df
+ dq 0x650a73548baf63de, 0x650a73548baf63de
+ dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6
+ dq 0x92722c851482353b, 0x92722c851482353b
+ dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
+ dq 0xa81a664bbc423001, 0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791
+ dq 0xc76c51a30654be30, 0xc76c51a30654be30
+ dq 0xd192e819d6ef5218, 0xd192e819d6ef5218
+ dq 0xd69906245565a910, 0xd69906245565a910
+ dq 0xf40e35855771202a, 0xf40e35855771202a
+ dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
+ dq 0x1e376c085141ab53, 0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99
+ dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
+ dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373
+ dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc
+ dq 0x78a5636f43172f60, 0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72
+ dq 0x8cc702081a6439ec, 0x8cc702081a6439ec
+ dq 0x90befffa23631e28, 0x90befffa23631e28
+ dq 0xa4506cebde82bde9, 0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
+ dq 0xc67178f2e372532b, 0xc67178f2e372532b
+ dq 0xca273eceea26619c, 0xca273eceea26619c
+ dq 0xd186b8c721c0c207, 0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
+ dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba, 0x06f067aa72176fba
+ dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae, 0x113f9804bef90dae
+ dq 0x1b710b35131c471b, 0x1b710b35131c471b
+ dq 0x28db77f523047d84, 0x28db77f523047d84
+ dq 0x32caab7b40c72493, 0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
+ dq 0x431d67c49c100d4c, 0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
+ dq 0x597f299cfc657e2a, 0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
+ dq 0x6c44198c4a475817, 0x6c44198c4a475817
+
+
+align 32
+; borrowed from sha512_rorx
+; byte-swaps each quad word (converts the little-endian
+; loaded message words to big-endian)
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+ dq 0x1011121314151617, 0x18191a1b1c1d1e1f
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm
new file mode 100644
index 000000000..6c658023f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm
@@ -0,0 +1,424 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute SHA512 by-2 using SSE
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; xmm0-15
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers: rax rdx r8 r9 r10 r11
+;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15
+;;
+;; Linux clobbers: rax rsi r8 r9 r10 r11
+;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15
+;;
+;; clobbers xmm0-15
+
+%define SHA512_DIGEST_WORD_SIZE 8
+%define NUM_SHA512_DIGEST_WORDS 8
+%define SHA512_DIGEST_ROW_SIZE 8*4
+%define PTR_SZ 8
+%define _data_ptr_sha512 _data_ptr
+
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux definitions
+ %define arg1 rdi
+ %define arg2 rsi
+%else
+; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+%define IDX rax
+%define ROUND r8
+%define TBL r11
+
+%define inp0 r9
+%define inp1 r10
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+%define SZ2 2*SHA512_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 80*SZ2
+
+; Define stack usage
+
+struc STACK
+_DATA: resb SZ2 * 16
+_DIGEST: resb SZ2 * NUM_SHA512_DIGEST_WORDS
+ resb 8 ; for alignment, must be odd multiple of 8
+endstruc
+
+%define MOVPD movupd
+
+; transpose r0, r1, t0
+; Input looks like {r0 r1}
+; r0 = {a1 a0}
+; r1 = {b1 b0}
+;
+; output looks like
+; r0 = {b0, a0}
+; t0 = {b1, a1}
+
+%macro TRANSPOSE 3
+%define %%r0 %1
+%define %%r1 %2
+%define %%t0 %3
+ movapd %%t0, %%r0 ; t0 = a1 a0
+ shufpd %%r0, %%r1, 00b ; r0 = b0 a0
+ shufpd %%t0, %%r1, 11b ; t0 = b1 a1
+%endm
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORQ reg, imm, tmp
+; packed-rotate-right-double
+; does a rotate by doing two shifts and an or
+%macro PRORQ 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ psllq %%tmp, (64-(%%imm))
+ psrlq %%reg, %%imm
+ por %%reg, %%tmp
+%endmacro
+
+; PRORQ dst/src, amt
+%macro PRORQ 2
+ PRORQ %1, %2, TMP
+%endmacro
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+ movdqa a0, e ; sig1: a0 = e
+ movdqa a1, e ; sig1: s1 = e
+ PRORQ a0, (18-14) ; sig1: a0 = (e >> 4)
+
+ movdqa a2, f ; ch: a2 = f
+ pxor a2, g ; ch: a2 = f^g
+ pand a2, e ; ch: a2 = (f^g)&e
+ pxor a2, g ; a2 = ch
+
+ PRORQ a1, 41 ; sig1: a1 = (e >> 41)
+ movdqa [SZ2*(%%i&0xf) + rsp],%%T1
+ paddq %%T1,[TBL + ROUND] ; T1 = W + K
+ pxor a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18)
+ paddq h, a2 ; h = h + ch
+ movdqa a2, a ; sig0: a2 = a
+ PRORQ a2, (34-28) ; sig0: a2 = (a >> 6)
+ paddq h, %%T1 ; h = h + ch + W + K
+ pxor a0, a1 ; a0 = sigma1
+ movdqa a1, a ; sig0: a1 = a
+ movdqa %%T1, a ; maj: T1 = a
+ PRORQ a1, 39 ; sig0: a1 = (a >> 39)
+ pxor %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ2 ; ROUND++
+ pand %%T1, b ; maj: T1 = (a^c)&b
+ paddq h, a0
+
+ paddq d, h
+
+ pxor a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34)
+ pxor a2, a1 ; a2 = sig0
+ movdqa a1, a ; maj: a1 = a
+ pand a1, c ; maj: a1 = a&c
+ por a1, %%T1 ; a1 = maj
+ paddq h, a1 ; h = h + ch + W + K + maj
+ paddq h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+ movdqa %%T1, [SZ2*((%%i-15)&0xf) + rsp]
+ movdqa a1, [SZ2*((%%i-2)&0xf) + rsp]
+ movdqa a0, %%T1
+ PRORQ %%T1, 8-1
+ movdqa a2, a1
+ PRORQ a1, 61-19
+ pxor %%T1, a0
+ PRORQ %%T1, 1
+ pxor a1, a2
+ PRORQ a1, 19
+ psrlq a0, 7
+ pxor %%T1, a0
+ psrlq a2, 6
+ pxor a1, a2
+ paddq %%T1, [SZ2*((%%i-16)&0xf) + rsp]
+ paddq a1, [SZ2*((%%i-7)&0xf) + rsp]
+ paddq %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+%endm
+
+;; void sha512_mb_x2_sse(SHA512_MB_ARGS_X4 *args, uint64_t num_blocks);
+;; arg 1 : STATE : pointer args (only 2 of the 4 lanes used)
+;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+;;
+mk_global sha512_mb_x2_sse, function, internal
+align 32
+sha512_mb_x2_sse:
+ endbranch
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+ sub rsp, STACK_size
+
+ ;; Load the pre-transposed incoming digest.
+ movdqa a,[STATE + 0 * SHA512_DIGEST_ROW_SIZE]
+ movdqa b,[STATE + 1 * SHA512_DIGEST_ROW_SIZE]
+ movdqa c,[STATE + 2 * SHA512_DIGEST_ROW_SIZE]
+ movdqa d,[STATE + 3 * SHA512_DIGEST_ROW_SIZE]
+ movdqa e,[STATE + 4 * SHA512_DIGEST_ROW_SIZE]
+ movdqa f,[STATE + 5 * SHA512_DIGEST_ROW_SIZE]
+ movdqa g,[STATE + 6 * SHA512_DIGEST_ROW_SIZE]
+ movdqa h,[STATE + 7 * SHA512_DIGEST_ROW_SIZE]
+
+ lea TBL,[K512_2_MB]
+
+ ;; load the address of each of the 2 message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _data_ptr_sha512 +0*PTR_SZ]
+ mov inp1,[STATE + _data_ptr_sha512 +1*PTR_SZ]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+ ;; save old digest
+ movdqa [rsp + _DIGEST + 0*SZ2], a
+ movdqa [rsp + _DIGEST + 1*SZ2], b
+ movdqa [rsp + _DIGEST + 2*SZ2], c
+ movdqa [rsp + _DIGEST + 3*SZ2], d
+ movdqa [rsp + _DIGEST + 4*SZ2], e
+ movdqa [rsp + _DIGEST + 5*SZ2], f
+ movdqa [rsp + _DIGEST + 6*SZ2], g
+ movdqa [rsp + _DIGEST + 7*SZ2], h
+
+%assign i 0
+%rep 8
+ ;; load up the shuffler for little-endian to big-endian format
+ movdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ MOVPD TT0,[inp0+IDX+i*16] ;; double precision is 64 bits
+ MOVPD TT2,[inp1+IDX+i*16]
+ TRANSPOSE TT0, TT2, TT1
+ pshufb TT0, TMP
+ pshufb TT1, TMP
+ ROUND_00_15 TT0,(i*2+0)
+ ROUND_00_15 TT1,(i*2+1)
+%assign i (i+1)
+%endrep
+ add IDX, 8 * 16 ;; increment by a message block
+
+%assign i (i*4)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ paddq a, [rsp + _DIGEST + 0*SZ2]
+ paddq b, [rsp + _DIGEST + 1*SZ2]
+ paddq c, [rsp + _DIGEST + 2*SZ2]
+ paddq d, [rsp + _DIGEST + 3*SZ2]
+ paddq e, [rsp + _DIGEST + 4*SZ2]
+ paddq f, [rsp + _DIGEST + 5*SZ2]
+ paddq g, [rsp + _DIGEST + 6*SZ2]
+ paddq h, [rsp + _DIGEST + 7*SZ2]
+
+ sub INP_SIZE, 1 ;; unit is blocks
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ movdqa [STATE + 0*SHA512_DIGEST_ROW_SIZE],a
+ movdqa [STATE + 1*SHA512_DIGEST_ROW_SIZE],b
+ movdqa [STATE + 2*SHA512_DIGEST_ROW_SIZE],c
+ movdqa [STATE + 3*SHA512_DIGEST_ROW_SIZE],d
+ movdqa [STATE + 4*SHA512_DIGEST_ROW_SIZE],e
+ movdqa [STATE + 5*SHA512_DIGEST_ROW_SIZE],f
+ movdqa [STATE + 6*SHA512_DIGEST_ROW_SIZE],g
+ movdqa [STATE + 7*SHA512_DIGEST_ROW_SIZE],h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0
+ add inp1, IDX
+ mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, STACK_size
+ ret
+
+section .data
+align 64
+mk_global K512_2_MB, data, internal
+K512_2_MB:
+ dq 0x428a2f98d728ae22, 0x428a2f98d728ae22
+ dq 0x7137449123ef65cd, 0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
+ dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538, 0x3956c25bf348b538
+ dq 0x59f111f1b605d019, 0x59f111f1b605d019
+ dq 0x923f82a4af194f9b, 0x923f82a4af194f9b
+ dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242, 0xd807aa98a3030242
+ dq 0x12835b0145706fbe, 0x12835b0145706fbe
+ dq 0x243185be4ee4b28c, 0x243185be4ee4b28c
+ dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f, 0x72be5d74f27b896f
+ dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235, 0x9bdc06a725c71235
+ dq 0xc19bf174cf692694, 0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
+ dq 0xefbe4786384f25e3, 0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
+ dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275, 0x2de92c6f592b0275
+ dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
+ dq 0x76f988da831153b5, 0x76f988da831153b5
+ dq 0x983e5152ee66dfab, 0x983e5152ee66dfab
+ dq 0xa831c66d2db43210, 0xa831c66d2db43210
+ dq 0xb00327c898fb213f, 0xb00327c898fb213f
+ dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
+ dq 0xd5a79147930aa725, 0xd5a79147930aa725
+ dq 0x06ca6351e003826f, 0x06ca6351e003826f
+ dq 0x142929670a0e6e70, 0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc
+ dq 0x2e1b21385c26c926, 0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
+ dq 0x53380d139d95b3df, 0x53380d139d95b3df
+ dq 0x650a73548baf63de, 0x650a73548baf63de
+ dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6
+ dq 0x92722c851482353b, 0x92722c851482353b
+ dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
+ dq 0xa81a664bbc423001, 0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791
+ dq 0xc76c51a30654be30, 0xc76c51a30654be30
+ dq 0xd192e819d6ef5218, 0xd192e819d6ef5218
+ dq 0xd69906245565a910, 0xd69906245565a910
+ dq 0xf40e35855771202a, 0xf40e35855771202a
+ dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
+ dq 0x1e376c085141ab53, 0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99
+ dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
+ dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373
+ dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc
+ dq 0x78a5636f43172f60, 0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72
+ dq 0x8cc702081a6439ec, 0x8cc702081a6439ec
+ dq 0x90befffa23631e28, 0x90befffa23631e28
+ dq 0xa4506cebde82bde9, 0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
+ dq 0xc67178f2e372532b, 0xc67178f2e372532b
+ dq 0xca273eceea26619c, 0xca273eceea26619c
+ dq 0xd186b8c721c0c207, 0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
+ dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba, 0x06f067aa72176fba
+ dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae, 0x113f9804bef90dae
+ dq 0x1b710b35131c471b, 0x1b710b35131c471b
+ dq 0x28db77f523047d84, 0x28db77f523047d84
+ dq 0x32caab7b40c72493, 0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
+ dq 0x431d67c49c100d4c, 0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
+ dq 0x597f299cfc657e2a, 0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
+ dq 0x6c44198c4a475817, 0x6c44198c4a475817
+
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm
new file mode 100644
index 000000000..0058f33a6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm
@@ -0,0 +1,487 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute quad SHA512 using AVX2
+;; use YMMs to tackle the larger digest size
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
+;; Stack must be aligned to 32 bytes before call
+;; Windows clobbers: rax rbx rdx r8 r9 r10 r11 r12
+;; Windows preserves: rcx rsi rdi rbp r13 r14 r15
+;;
+;; Linux clobbers:    rax rbx rsi r8 r9 r10 r11 r12
+;; Linux preserves:         rcx rdx rdi rbp r13 r14 r15
+;;
+;; clobbers ymm0-15
+
+%define SHA512_DIGEST_WORD_SIZE 8
+%define NUM_SHA512_DIGEST_WORDS 8
+%define SHA512_DIGEST_ROW_SIZE 8*4
+%define PTR_SZ 8
+%define _data_ptr_sha512 _data_ptr
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi
+%define arg2 rsi
+%else
+; Windows register definitions
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+%define IDX rax
+%define ROUND rbx
+%define TBL r8
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+
+%define a ymm0
+%define b ymm1
+%define c ymm2
+%define d ymm3
+%define e ymm4
+%define f ymm5
+%define g ymm6
+%define h ymm7
+
+%define a0 ymm8
+%define a1 ymm9
+%define a2 ymm10
+
+%define TT0 ymm14
+%define TT1 ymm13
+%define TT2 ymm12
+%define TT3 ymm11
+%define TT4 ymm10
+%define TT5 ymm9
+
+%define T1 ymm14
+%define TMP ymm15
+
+%define SZ4 4*SHA512_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 80*SZ4
+
+; Define stack usage
+
+;; Assume stack aligned to 32 bytes before call
+;; Therefore FRAMESZ mod 32 must be 32-8 = 24
+struc stack_frame
+ .data resb 16*SZ4
+ .digest resb NUM_SHA512_DIGEST_WORDS*SZ4
+ .align resb 24
+endstruc
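+;; .data   : 16 * SZ4 = 512 bytes, ring buffer holding the message schedule
+;; .digest : 8 * SZ4  = 256 bytes, copy of the digest saved before each block
+;; .align  : 24 bytes of padding so the total (792) mod 32 == 24, keeping rsp
+;;           32-byte aligned after the 8-byte return address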
+
+%define _DIGEST stack_frame.digest
+
+%define VMOVPD vmovupd
+
+; operates on YMMs
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d1 d0 c1 c0 b1 b0 a1 a0}
+; r1 = {d3 d2 c3 c2 b3 b2 a3 a2}
+; r0 = {d5 d4 c5 c4 b5 b4 a5 a4}
+; r3 = {d7 d6 c7 c6 b7 b6 a7 a6}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ ; vshufps does not cross the mid-way boundary and hence is cheaper
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+
+ vperm2f128 %%r1, %%r0, %%r2, 0x20; r1 = {d3 d2 c3 c2 b3 b2 a3 a2}
+
+ vperm2f128 %%r3, %%r0, %%r2, 0x31; r3 = {d7 d6 c7 c6 b7 b6 a7 a6}
+
+ vperm2f128 %%r0, %%t0, %%t1, 0x31; r0 = {d5 d4 c5 c4 b5 b4 a5 a4}
+
+ ; now ok to clobber t0
+ vperm2f128 %%t0, %%t0, %%t1, 0x20; t0 = {d1 d0 c1 c0 b1 b0 a1 a0}
+
+%endmacro
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORQ reg, imm, tmp
+; packed-rotate-right-double
+; does a rotate by doing two shifts and an or
+%macro PRORQ 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsllq %%tmp, %%reg, (64-(%%imm))
+ vpsrlq %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PRORQ_nd reg, imm, tmp, src
+%macro PRORQ_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsllq %%tmp, %%src, (64-(%%imm))
+ vpsrlq %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORQ dst/src, amt
+%macro PRORQ 2
+ PRORQ %1, %2, TMP
+%endmacro
+
+; PRORQ_nd dst, src, amt
+%macro PRORQ_nd 3
+ PRORQ_nd %1, %3, TMP, %2
+%endmacro
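+
+; AVX2 has no packed 64-bit rotate instruction (vprorq only arrives with
+; AVX-512), so PRORQ synthesizes the rotate per lane from two shifts and an
+; or; e.g. "PRORQ a0, 14" leaves a0 = ror64(a0, 14) in every 64-bit lane,
+; using TMP as scratch.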
+
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+ PRORQ_nd a0, e, (18-14) ; sig1: a0 = (e >> 4)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORQ_nd a1, e, 41 ; sig1: a1 = (e >> 41)
+ vmovdqa [SZ4*(%%i&0xf) + rsp],%%T1
+ vpaddq %%T1,%%T1,[TBL + ROUND] ; T1 = W + K
+	vpxor	a0, a0, e	; sig1: a0 = e ^ (e >> 4)
+ PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18)
+ vpaddq h, h, a2 ; h = h + ch
+ PRORQ_nd a2, a, (34-28) ; sig0: a2 = (a >> 6)
+ vpaddq h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ vmovdqa %%T1, a ; maj: T1 = a
+ PRORQ_nd a1, a, 39 ; sig0: a1 = (a >> 39)
+ vpxor %%T1, %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddq h, h, a0
+
+ vpaddq d, d, h
+
+	vpxor	a2, a2, a	; sig0: a2 = a ^ (a >> 6)
+ PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddq h, h, a1 ; h = h + ch + W + K + maj
+ vpaddq h, h, a2 ; h = h + ch + W + K + maj + sigma0
+ ROTATE_ARGS
+
+%endm
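+
+;; ROUND_00_15 computes one SHA-512 round across the four lanes:
+;;   T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]
+;;   T2 = Sigma0(a) + Maj(a,b,c)
+;;   d = d + T1 ; h = T1 + T2
+;; ROTATE_ARGS then renames the registers rather than copying them.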
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+ vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp]
+ vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp]
+ vmovdqa a0, %%T1
+ PRORQ %%T1, 8-1
+ vmovdqa a2, a1
+ PRORQ a1, 61-19
+ vpxor %%T1, %%T1, a0
+ PRORQ %%T1, 1
+ vpxor a1, a1, a2
+ PRORQ a1, 19
+ vpsrlq a0, a0, 7
+ vpxor %%T1, %%T1, a0
+ vpsrlq a2, a2, 6
+ vpxor a1, a1, a2
+ vpaddq %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp]
+ vpaddq a1, a1, [SZ4*((%%i-7)&0xf) + rsp]
+ vpaddq %%T1, %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+
+%endm
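+
+;; ROUND_16_XX computes the message schedule for rounds 16..79:
+;;   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
+;; using the 16-entry ring buffer kept on the stack, then feeds the new word
+;; straight into ROUND_00_15.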
+
+
+;; void sha512_mb_x4_avx2(SHA512_MB_ARGS_X4 *STATE, const int INP_SIZE)
+;; arg 1 : STATE : pointer to args structure (digests and data pointers)
+;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+mk_global sha512_mb_x4_avx2, function, internal
+align 32
+sha512_mb_x4_avx2:
+ endbranch
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+
+ sub rsp, stack_frame_size
+
+ ;; Load the pre-transposed incoming digest.
+ vmovdqu a, [STATE+ 0*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu b, [STATE+ 1*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu c, [STATE+ 2*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu d, [STATE+ 3*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu e, [STATE+ 4*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu f, [STATE+ 5*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu g, [STATE+ 6*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu h, [STATE+ 7*SHA512_DIGEST_ROW_SIZE]
+
+
+ lea TBL,[K512_4_MB]
+
+ ;; load the address of each of the MAX_LANES (4) message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _data_ptr_sha512 + 0*PTR_SZ]
+ mov inp1,[STATE + _data_ptr_sha512 + 1*PTR_SZ]
+ mov inp2,[STATE + _data_ptr_sha512 + 2*PTR_SZ]
+ mov inp3,[STATE + _data_ptr_sha512 + 3*PTR_SZ]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ ;; save old digest
+ vmovdqa [rsp + _DIGEST + 0*SZ4], a
+ vmovdqa [rsp + _DIGEST + 1*SZ4], b
+ vmovdqa [rsp + _DIGEST + 2*SZ4], c
+ vmovdqa [rsp + _DIGEST + 3*SZ4], d
+ vmovdqa [rsp + _DIGEST + 4*SZ4], e
+ vmovdqa [rsp + _DIGEST + 5*SZ4], f
+ vmovdqa [rsp + _DIGEST + 6*SZ4], g
+ vmovdqa [rsp + _DIGEST + 7*SZ4], h
+
+%assign i 0
+%rep 4
+ ;; load up the shuffler for little-endian to big-endian format
+ vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ VMOVPD TT2,[inp0+IDX+i*32]
+ VMOVPD TT1,[inp1+IDX+i*32]
+ VMOVPD TT4,[inp2+IDX+i*32]
+ VMOVPD TT3,[inp3+IDX+i*32]
+ TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
+ vpshufb TT0, TT0, TMP
+ vpshufb TT1, TT1, TMP
+ vpshufb TT2, TT2, TMP
+ vpshufb TT3, TT3, TMP
+ ROUND_00_15 TT0,(i*4+0)
+ ROUND_00_15 TT1,(i*4+1)
+ ROUND_00_15 TT2,(i*4+2)
+ ROUND_00_15 TT3,(i*4+3)
+%assign i (i+1)
+%endrep
+;; Increment IDX by message block size == 4 (loop) * 32 (YMM width in bytes)
+ add IDX, 4 * 32
+
+%assign i (i*4)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ vpaddq a, a, [rsp + _DIGEST + 0*SZ4]
+ vpaddq b, b, [rsp + _DIGEST + 1*SZ4]
+ vpaddq c, c, [rsp + _DIGEST + 2*SZ4]
+ vpaddq d, d, [rsp + _DIGEST + 3*SZ4]
+ vpaddq e, e, [rsp + _DIGEST + 4*SZ4]
+ vpaddq f, f, [rsp + _DIGEST + 5*SZ4]
+ vpaddq g, g, [rsp + _DIGEST + 6*SZ4]
+ vpaddq h, h, [rsp + _DIGEST + 7*SZ4]
+
+ sub INP_SIZE, 1 ;; consumed one message block
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ vmovdqu [STATE+ 0*SHA512_DIGEST_ROW_SIZE ],a
+ vmovdqu [STATE+ 1*SHA512_DIGEST_ROW_SIZE ],b
+ vmovdqu [STATE+ 2*SHA512_DIGEST_ROW_SIZE ],c
+ vmovdqu [STATE+ 3*SHA512_DIGEST_ROW_SIZE ],d
+ vmovdqu [STATE+ 4*SHA512_DIGEST_ROW_SIZE ],e
+ vmovdqu [STATE+ 5*SHA512_DIGEST_ROW_SIZE ],f
+ vmovdqu [STATE+ 6*SHA512_DIGEST_ROW_SIZE ],g
+ vmovdqu [STATE+ 7*SHA512_DIGEST_ROW_SIZE ],h
+
+ ;; update input data pointers
+ add inp0, IDX
+ mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0
+ add inp1, IDX
+ mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1
+ add inp2, IDX
+ mov [STATE + _data_ptr_sha512 + 2*PTR_SZ], inp2
+ add inp3, IDX
+ mov [STATE + _data_ptr_sha512 + 3*PTR_SZ], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, stack_frame_size
+
+ ; outer calling routine restores XMM and other GP registers
+ ret
+
+section .data
+align 64
+K512_4_MB:
+ dq 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22
+ dq 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
+ dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538
+ dq 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019
+ dq 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b
+ dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242
+ dq 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe
+ dq 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c
+ dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f
+ dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235
+ dq 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
+ dq 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
+ dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275
+ dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
+ dq 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5
+ dq 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab
+ dq 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210
+ dq 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f
+ dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
+ dq 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725
+ dq 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f
+ dq 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc
+ dq 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
+ dq 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df
+ dq 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de
+ dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6
+ dq 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b
+ dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
+ dq 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791
+ dq 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30
+ dq 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218
+ dq 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910
+ dq 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a
+ dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
+ dq 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99
+ dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
+ dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373
+ dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc
+ dq 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72
+ dq 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec
+ dq 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28
+ dq 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
+ dq 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b
+ dq 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c
+ dq 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
+ dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba
+ dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae
+ dq 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b
+ dq 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84
+ dq 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
+ dq 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
+ dq 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
+ dq 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817
+
+align 32
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+ dq 0x1011121314151617, 0x18191a1b1c1d1e1f
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm
new file mode 100644
index 000000000..a93fecb1b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm
@@ -0,0 +1,644 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute 8-lane SHA512 using AVX512
+;; use ZMMs to tackle the larger digest size
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; zmm0-31
+;; Stack must be aligned to 32 bytes before call
+;; Windows clobbers: rax rbx rdx rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rsi
+;;
+;; Linux clobbers: rax rbx rcx rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdx rdi
+;;
+;; clobbers zmm0-31
+
+%define APPEND(a,b) a %+ b
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx ; arg0 preserved
+ %define arg2 rdx ; arg1
+ %define reg3 r8 ; arg2 preserved
+ %define reg4 r9 ; arg3
+ %define var1 rdi ; usable
+ %define var2 rsi
+ %define local_func_decl(func_name) global func_name
+ %else
+ %define arg1 rdi ; arg0
+ %define arg2 rsi ; arg1
+ %define var2 rdx ; arg2
+ %define var1 rcx ; arg3 usable
+ %define local_func_decl(func_name) mk_global func_name, function, internal
+%endif
+
+%define state arg1
+%define num_blks arg2
+
+%define IN (state + _data_ptr)
+%define DIGEST state
+%define SIZE num_blks
+
+%define IDX var1
+%define TBL r8
+
+%define VMOVDQ32 vmovdqu32
+
+%define SHA512_DIGEST_WORD_SIZE 8
+%define NUM_SHA512_DIGEST_WORDS 8
+%define SHA512_DIGEST_ROW_SIZE 8*8
+%define PTR_SZ 8
+%define _data_ptr_sha512 _data_ptr
+
+%define NUM_LANES 8
+%define SZ 8
+%define SZ8 8 * SZ
+%define DIGEST_SZ 8 * SZ8
+%define DIGEST_SAVE NUM_LANES * DIGEST_SZ
+%define RSP_SAVE 1*8
+
+; Define Stack Layout
+START_FIELDS
+;;; name size align
+FIELD _DIGEST_SAVE, NUM_LANES*8*64, 64
+FIELD _RSP, 8, 8
+%assign STACK_SPACE _FIELD_OFFSET
+
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 rax
+
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define F zmm5
+%define G zmm6
+%define H zmm7
+%define T1 zmm8
+%define TMP0 zmm9
+%define TMP1 zmm10
+%define TMP2 zmm11
+%define TMP3 zmm12
+%define TMP4 zmm13
+%define TMP5 zmm14
+%define TMP6 zmm15
+
+
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+; rotate amounts from FIPS 180-2 (SHA-512)
+; define rotates for Sigma function for main loop steps
+%define BIG_SIGMA_0_0 28 ; Sigma0
+%define BIG_SIGMA_0_1 34
+%define BIG_SIGMA_0_2 39
+%define BIG_SIGMA_1_0 14 ; Sigma1
+%define BIG_SIGMA_1_1 18
+%define BIG_SIGMA_1_2 41
+
+; define rotates for Sigma function for scheduling steps
+
+%define SMALL_SIGMA_0_0 1 ; sigma0
+%define SMALL_SIGMA_0_1 8
+%define SMALL_SIGMA_0_2 7
+%define SMALL_SIGMA_1_0 19 ; sigma1
+%define SMALL_SIGMA_1_1 61
+%define SMALL_SIGMA_1_2 6
+
+%define SHA_MAX_ROUNDS 80
+%define SHA_ROUNDS_LESS_16 (SHA_MAX_ROUNDS - 16)
+
+%macro TRANSPOSE8 12
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+%define %%PERM_INDEX1 %11
+%define %%PERM_INDEX2 %12
+
+
+; each element is 64 bits, 8 * 64 = 512 ==> a full ZMM register of quad-word quantities
+; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
+
+	;; the permute index registers will not get clobbered
+ vmovdqa32 %%PERM_INDEX1, [TRANSPOSE8_PERM_INDEX_1] ; temp
+ vmovdqa32 %%PERM_INDEX2, [TRANSPOSE8_PERM_INDEX_2] ; temp
+
+ ; process top half (r0..r3) {a...d}
+ vshufpd %%t0, %%r0, %%r1, 0x00 ; t0 = {b6 a6 b4 a4 b2 a2 b0 a0}
+ vshufpd %%r0, %%r0, %%r1, 0xFF ; r0 = {b7 a7 b5 a5 b3 a3 b1 a1}
+ vshufpd %%t1, %%r2, %%r3, 0x00 ; t1 = {d6 c6 d4 c4 d2 c2 d0 c0}
+ vshufpd %%r2, %%r2, %%r3, 0xFF ; r2 = {d7 c7 d5 c5 d3 c3 d1 c1}
+
+ vmovdqa32 %%r1, %%t0 ; r1 and r3 free
+ vpermt2q %%r1, %%PERM_INDEX1,%%t1 ; r1 = {d4 c4 b4 a4 d0 c0 b0 a0}
+ vpermt2q %%t0, %%PERM_INDEX2,%%t1 ; t0 = {d6 c6 b6 a6 d2 c2 b2 a2}
+
+ vmovdqa32 %%t1, %%r0 ; t1 and r3 free
+ vpermt2q %%t1, %%PERM_INDEX1,%%r2 ; t1 = {d5 c5 b5 a5 d1 c1 b1 a1}
+ vpermt2q %%r0, %%PERM_INDEX2,%%r2 ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+
+	;; Likewise for bottom half (r4..r7) {e...h}	; r2 and r3 free
+ vshufpd %%r2, %%r4, %%r5, 0x00 ; r2 = {f6 e6 f4 e4 f2 e2 f0 e0}
+ vshufpd %%r4, %%r4, %%r5, 0xFF ; r4 = {f7 e7 f5 e5 f3 e3 f1 e1}
+ vshufpd %%r3, %%r6, %%r7, 0x00 ; r3 = {h6 g6 h4 g4 h2 g2 h0 g0}
+ vshufpd %%r6, %%r6, %%r7, 0xFF ; r6 = {h7 g7 h5 g5 h3 g3 h1 g1}
+
+ vmovdqa32 %%r5, %%r2 ; r5 and r7 free
+ vpermt2q %%r5, %%PERM_INDEX1,%%r3 ; r5 = {h4 g4 f4 e4 h0 g0 f0 e0}
+ vpermt2q %%r2, %%PERM_INDEX2,%%r3 ; r2 = {h6 g6 f6 e6 h2 g2 f2 e2}
+
+ vmovdqa32 %%r7, %%r4
+ vpermt2q %%r7, %%PERM_INDEX1,%%r6 ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+ vpermt2q %%r4, %%PERM_INDEX2,%%r6 ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+
+;;; free r3, r6
+ vshuff64x2 %%r6, %%t0, %%r2, 0xEE ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+ vshuff64x2 %%r2, %%t0, %%r2, 0x44 ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+
+;;; t0 and r3 free
+ vshuff64x2 %%r3, %%r0, %%r4, 0x44 ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+ vshuff64x2 %%t0, %%r0, %%r4, 0xEE ; t0 = {h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vshuff64x2 %%r4, %%r1, %%r5, 0xEE ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+ vshuff64x2 %%r0, %%r1, %%r5, 0x44 ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+
+
+ vshuff64x2 %%r5, %%t1, %%r7, 0xEE ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+ vshuff64x2 %%r1, %%t1, %%r7, 0x44 ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+
+ ;; will re-order input to avoid move
+ ;vmovdqa32 %%r7, %%t0
+
+ ; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+ ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+ ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+ ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+ ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+ ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+ ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+ ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+	; t0 = {h7 g7 f7 e7 d7 c7 b7 a7}	(row 7 is left in the temp register t0)
+%endmacro
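+
+;; After TRANSPOSE8 each output register holds the same word index from all
+;; eight lanes (row 7 ends up in the t0 temp), which is the layout the
+;; vectorized round computation expects.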
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ H
+%xdefine H G
+%xdefine G F
+%xdefine F E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+
+
+;; CH(E, F, G)  = (E&F) ^ (~E&G)
+;; MAJ(A, B, C) = (A&B) ^ (A&C) ^ (B&C)
+;; SIGMA0 = ROR_28 ^ ROR_34 ^ ROR_39
+;; SIGMA1 = ROR_14 ^ ROR_18 ^ ROR_41
+;; sigma0 = ROR_1 ^ ROR_8 ^ SHR_7
+;; sigma1 = ROR_19 ^ ROR_61 ^ SHR_6
+
+;; Main processing loop per round
+;; equivalent to %macro ROUND_00_15 2
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%ROUND %2
+ ;; T1 = H + BIG_SIGMA_1(E) + CH(E, F, G) + Kt + Wt
+ ;; T2 = BIG_SIGMA_0(A) + MAJ(A, B, C)
+ ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
+
+ ;; H becomes T2, then add T1 for A
+ ;; D becomes D + T1 for E
+
+ vpaddq T1, H, TMP3 ; T1 = H + Kt
+ vmovdqa32 TMP0, E
+ ;; compute BIG_SIGMA_1(E)
+ vprorq TMP1, E, BIG_SIGMA_1_0 ; ROR_14(E)
+ vprorq TMP2, E, BIG_SIGMA_1_1 ; ROR_18(E)
+ vprorq TMP3, E, BIG_SIGMA_1_2 ; ROR_41(E)
+ vpternlogq TMP1, TMP2, TMP3, 0x96 ; TMP1 = BIG_SIGMA_1(E)
+ vpternlogq TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G)
+ vpaddq T1, T1, %%WT ; T1 = T1 + Wt
+ vpaddq T1, T1, TMP0 ; T1 = T1 + CH(E,F,G)
+ vpaddq T1, T1, TMP1 ; T1 = T1 + BIG_SIGMA_1(E)
+ vpaddq D, D, T1 ; D = D + T1
+ vprorq H, A, BIG_SIGMA_0_0 ;ROR_28(A)
+ vprorq TMP2, A, BIG_SIGMA_0_1 ;ROR_34(A)
+ vprorq TMP3, A, BIG_SIGMA_0_2 ;ROR_39(A)
+ vmovdqa32 TMP0, A
+ vpternlogq TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C)
+ vpternlogq H, TMP2, TMP3, 0x96 ; H(T2) = BIG_SIGMA_0(A)
+ vpaddq H, H, TMP0 ; H(T2) = BIG_SIGMA_0(A) + MAJ(A,B,C)
+ vpaddq H, H, T1 ; H(A) = H(T2) + T1
+ vmovdqa32 TMP3, [TBL + ((%%ROUND+1)*64)] ; Next Kt
+
+ ;; Rotate the args A-H (rotation of names associated with regs)
+ ROTATE_ARGS
+%endmacro
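+
+;; vpternlogq immediates used above: 0x96 is a three-way XOR (both Sigma
+;; functions), 0xCA computes (dst & src2) | (~dst & src3), which is CH(E,F,G)
+;; when the destination holds E, and 0xE8 is the bitwise majority of the
+;; three operands.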
+
+%macro MSG_SCHED_ROUND_16_79 4
+%define %%WT %1
+%define %%WTp1 %2
+%define %%WTp9 %3
+%define %%WTp14 %4
+ vprorq TMP4, %%WTp14, SMALL_SIGMA_1_0 ; ROR_19(Wt-2)
+ vprorq TMP5, %%WTp14, SMALL_SIGMA_1_1 ; ROR_61(Wt-2)
+ vpsrlq TMP6, %%WTp14, SMALL_SIGMA_1_2 ; SHR_6(Wt-2)
+ vpternlogq TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma_1(Wt-2)
+
+ vpaddq %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma_1(Wt-2)
+ vpaddq %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma_1(Wt-2) + Wt-7
+
+ vprorq TMP4, %%WTp1, SMALL_SIGMA_0_0 ; ROR_1(Wt-15)
+ vprorq TMP5, %%WTp1, SMALL_SIGMA_0_1 ; ROR_8(Wt-15)
+ vpsrlq TMP6, %%WTp1, SMALL_SIGMA_0_2 ; SHR_7(Wt-15)
+ vpternlogq TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma_0(Wt-15)
+
+ vpaddq %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma_1(Wt-2) +
+ ; Wt-7 + sigma_0(Wt-15) +
+
+%endmacro
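+
+;; MSG_SCHED_ROUND_16_79 schedules one future message word per call:
+;;   W[t] = W[t-16] + sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15])
+;; The sixteen W registers form a ring, so Wt is overwritten in place and
+;; reused sixteen rounds later.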
+
+align 64
+
+; void sha512_mb_x8_avx512(SHA512_MB_ARGS_X8 *args, uint32_t size)
+; arg 1 : pointer to args structure (digests and data pointers)
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+local_func_decl(sha512_mb_x8_avx512)
+sha512_mb_x8_avx512:
+ endbranch
+ mov rax, rsp
+ sub rsp, STACK_SPACE
+ and rsp, ~63 ; align stack to multiple of 64
+ mov [rsp + _RSP], rax
+ lea TBL,[TABLE]
+
+ ;; Initialize digests
+ vmovups A, [DIGEST + 0*8*8]
+ vmovups B, [DIGEST + 1*8*8]
+ vmovups C, [DIGEST + 2*8*8]
+ vmovups D, [DIGEST + 3*8*8]
+ vmovups E, [DIGEST + 4*8*8]
+ vmovups F, [DIGEST + 5*8*8]
+ vmovups G, [DIGEST + 6*8*8]
+ vmovups H, [DIGEST + 7*8*8]
+
+ xor IDX, IDX
+	;; Read in the input data addresses, saving them in registers because
+	;; they serve as variables that we keep incrementing
+ mov inp0, [IN + 0*8]
+ mov inp1, [IN + 1*8]
+ mov inp2, [IN + 2*8]
+ mov inp3, [IN + 3*8]
+ mov inp4, [IN + 4*8]
+ mov inp5, [IN + 5*8]
+ mov inp6, [IN + 6*8]
+ mov inp7, [IN + 7*8]
+
+lloop:
+
+ ;; first half of 1024 (need to transpose before use)
+ vmovups W0,[inp0 + IDX ]
+ vmovups W1,[inp1 + IDX ]
+ vmovups W2,[inp2 + IDX ]
+ vmovups W3,[inp3 + IDX ]
+ vmovups W4,[inp4 + IDX ]
+ vmovups W5,[inp5 + IDX ]
+ vmovups W6,[inp6 + IDX ]
+ vmovups TMP0,[inp7 + IDX ]
+ TRANSPOSE8 W0, W1, W2, W3, W4, W5, W6, TMP0, W7, TMP1, TMP2, TMP3
+ ;; second half of 1024 (need to transpose before use)
+ vmovups W8,[inp0 + SZ8 + IDX ]
+ vmovups W9,[inp1 + SZ8 + IDX ]
+ vmovups W10,[inp2 + SZ8 + IDX ]
+ vmovups W11,[inp3 + SZ8 + IDX ]
+ vmovups W12,[inp4 + SZ8 + IDX ]
+ vmovups W13,[inp5 + SZ8 + IDX ]
+ vmovups W14,[inp6 + SZ8 + IDX ]
+ vmovups TMP0,[inp7 + SZ8 + IDX ]
+ TRANSPOSE8 W8, W9, W10, W11, W12, W13, W14, TMP0, W15, TMP1, TMP2, TMP3
+
+ vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK]
+
+ vmovdqa32 TMP3, [TBL] ; First K
+
+ ; Save digests for later addition
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H
+
+ add IDX, 128 ; increment by message block length in bytes
+
+
+
+
+%assign I 0
+%rep 16
+;;; little endian to big endian
+ vpshufb APPEND(W,I), APPEND(W,I), TMP2
+%assign I (I+1)
+%endrep
+ ; Save digests for later addition
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H
+
+ ; MSG Schedule for W0-W15 is now complete in registers
+ ; Process first (max-rounds -16)
+ ; Calculate next Wt+16 after processing is complete and Wt is unneeded
+
+ ; PROCESS_LOOP_00_79 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M)
+
+%assign I 0
+%assign J 0
+%assign K 1
+%assign L 9
+%assign M 14
+%rep SHA_ROUNDS_LESS_16
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+%assign I (I+1)
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%endrep
+	; Check if this is the last block
+ sub SIZE, 1
+ je lastLoop
+
+	; Process the last 16 rounds of this block; the next block's message
+	; data is loaded back at the top of lloop
+%assign I SHA_ROUNDS_LESS_16
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+ ; Add old digest
+ vpaddq A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddq B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddq C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddq D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddq E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddq F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddq G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddq H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+ jmp lloop
+
+
+lastLoop:
+ ; Process last 16 rounds
+%assign I SHA_ROUNDS_LESS_16
+%assign J 0
+
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddq A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddq B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddq C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddq D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddq E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddq F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddq G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddq H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+;; update input data pointers
+%assign I 0
+%rep 4
+ mov inp0, [IN + (2*I)*8]
+ mov inp1, [IN + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [IN + (2*I)*8], inp0
+ mov [IN + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+
+ VMOVDQ32 [DIGEST + 0*8*8], A
+ VMOVDQ32 [DIGEST + 1*8*8], B
+ VMOVDQ32 [DIGEST + 2*8*8], C
+ VMOVDQ32 [DIGEST + 3*8*8], D
+ VMOVDQ32 [DIGEST + 4*8*8], E
+ VMOVDQ32 [DIGEST + 5*8*8], F
+ VMOVDQ32 [DIGEST + 6*8*8], G
+ VMOVDQ32 [DIGEST + 7*8*8], H
+
+ mov rsp, [rsp + _RSP]
+ ret
+
+ section .data
+align 64
+; 80 SHA-512 round constants (K), each replicated once per lane, thus 8*80
+; quad words, trading space for time so a full ZMM of Kt can be loaded at once
+; local to this asm file, used nowhere else
+TABLE:
+ dq 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22
+ dq 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
+ dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538
+ dq 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019
+ dq 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b
+ dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242
+ dq 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe
+ dq 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c
+ dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f
+ dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235
+ dq 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
+ dq 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
+ dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275
+ dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
+ dq 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5
+ dq 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab
+ dq 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210
+ dq 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f
+ dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
+ dq 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725
+ dq 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f
+ dq 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc
+ dq 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
+ dq 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df
+ dq 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de
+ dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6
+ dq 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b
+ dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
+ dq 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791
+ dq 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30
+ dq 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218
+ dq 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910
+ dq 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a
+ dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
+ dq 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99
+ dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
+ dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373
+ dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc
+ dq 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72
+ dq 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec
+ dq 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28
+ dq 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
+ dq 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b
+ dq 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c
+ dq 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
+ dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba
+ dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae
+ dq 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b
+ dq 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84
+ dq 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
+ dq 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
+ dq 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
+ dq 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817
+
+align 64
+; byte-swap mask: converts each quad word between little-endian and big-endian
+;; a shuffle on a ZMM operates independently on each of the four 128-bit (XMM-sized) chunks
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+ dq 0x1011121314151617, 0x18191a1b1c1d1e1f
+ dq 0x2021222324252627, 0x28292a2b2c2d2e2f
+ dq 0x3031323334353637, 0x38393a3b3c3d3e3f
+
+align 64
+TRANSPOSE8_PERM_INDEX_1: dq 0x0000000000000000
+ dq 0x0000000000000001
+ dq 0x0000000000000008
+ dq 0x0000000000000009
+ dq 0x0000000000000004
+ dq 0x0000000000000005
+ dq 0x000000000000000C
+ dq 0x000000000000000D
+
+TRANSPOSE8_PERM_INDEX_2: dq 0x0000000000000002
+ dq 0x0000000000000003
+ dq 0x000000000000000A
+ dq 0x000000000000000B
+ dq 0x0000000000000006
+ dq 0x0000000000000007
+ dq 0x000000000000000E
+ dq 0x000000000000000F
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha512_mb_x8_avx512
+no_sha512_mb_x8_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm
new file mode 100644
index 000000000..1113a1eea
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm
@@ -0,0 +1,252 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+
+;;;;;
+; mbin_dispatch_init_avoton parameters
+; Use this macro when an SSE (00/01) version is the minimum requirement;
+; when running on Avoton, avoton_func is selected instead of sse_func
+; 1-> function name
+; 2-> SSE/00/01 optimized function used as base
+; 3-> AVX or AVX/02 opt func
+; 4-> AVX2 or AVX/04 opt func
+; 5-> AVOTON opt func
+;;;;;
+%macro mbin_dispatch_init_avoton 5
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ push mbin_rdi
+ lea mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01
+
+ mov eax, 1
+ cpuid
+ lea mbin_rdi, [%5 WRT_OPT]
+ and eax, FLAG_CPUID1_EAX_STEP_MASK
+ cmp eax, FLAG_CPUID1_EAX_AVOTON
+ ; If Avoton, set Avoton symbol and exit
+ cmove mbin_rsi, mbin_rdi
+ je _%1_init_done
+
+ and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ lea mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func
+ jne _%1_init_done ; AVX is not available so end
+ mov mbin_rsi, mbin_rbx
+
+ ;; Try for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen4) opt func
+ cmovne mbin_rsi, mbin_rbx
+
+ ;; Does it have xmm and ymm support
+ xor ecx, ecx
+ xgetbv
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ je _%1_init_done
+ lea mbin_rsi, [%2 WRT_OPT]
+
+ _%1_init_done:
+ pop mbin_rdi
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
+
+;;;;;
+; mbin_dispatch_init6_avoton parameters
+; when running on Avoton, avoton_func is selected instead of sse_func
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_1 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+; 6-> AVX512/06 opt func
+; 7-> AVOTON opt func
+;;;;;
+%macro mbin_dispatch_init6_avoton 7
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ push mbin_rdi
+ lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+ mov eax, 1
+ cpuid
+ mov ebx, ecx ; save cpuid1.ecx
+ test ecx, FLAG_CPUID1_ECX_SSE4_1
+ je _%1_init_done ; Use base function if no SSE4_1
+ lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
+
+ lea mbin_rdi, [%7 WRT_OPT]
+ and eax, FLAG_CPUID1_EAX_STEP_MASK
+ cmp eax, FLAG_CPUID1_EAX_AVOTON
+ ; If Avoton, set Avoton symbol and exit
+ cmove mbin_rsi, mbin_rdi
+ je _%1_init_done
+
+
+ ;; Test for XMM_YMM support/AVX
+ test ecx, FLAG_CPUID1_ECX_OSXSAVE
+ je _%1_init_done
+ xor ecx, ecx
+ xgetbv ; xcr -> edx:eax
+ mov edi, eax ; save xgetvb.eax
+
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ jne _%1_init_done
+ test ebx, FLAG_CPUID1_ECX_AVX
+ je _%1_init_done
+ lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
+
+ ;; Test for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ je _%1_init_done ; No AVX2 possible
+ lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func
+
+ ;; Test for AVX512
+ and edi, FLAG_XGETBV_EAX_ZMM_OPM
+ cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
+ jne _%1_init_done ; No AVX512 possible
+ and ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt
+ cmove mbin_rsi, mbin_rbx
+
+ _%1_init_done:
+ pop mbin_rdi
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
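+
+;; Runtime selection order: start from the base function, upgrade to the
+;; 00/01 (SSE4.1) version, take the Avoton path if the CPU stepping matches,
+;; otherwise keep upgrading to AVX, AVX2 and finally AVX512 as each
+;; CPUID/XGETBV check passes.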
+
+default rel
+[bits 64]
+
+%define def_wrd dq
+%define wrd_sz qword
+%define arg1 rsi
+
+; declare the L3 ctx level symbols (these will then call the appropriate
+; L2 symbols)
+extern sha512_ctx_mgr_init_sse
+extern sha512_ctx_mgr_submit_sse
+extern sha512_ctx_mgr_flush_sse
+
+extern sha512_ctx_mgr_init_avx
+extern sha512_ctx_mgr_submit_avx
+extern sha512_ctx_mgr_flush_avx
+
+extern sha512_ctx_mgr_init_avx2
+extern sha512_ctx_mgr_submit_avx2
+extern sha512_ctx_mgr_flush_avx2
+
+extern sha512_ctx_mgr_init_base
+extern sha512_ctx_mgr_submit_base
+extern sha512_ctx_mgr_flush_base
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ extern sha512_ctx_mgr_init_avx512
+ extern sha512_ctx_mgr_submit_avx512
+ extern sha512_ctx_mgr_flush_avx512
+%endif
+
+extern sha512_ctx_mgr_init_sb_sse4
+extern sha512_ctx_mgr_submit_sb_sse4
+extern sha512_ctx_mgr_flush_sb_sse4
+
+;;; The *_mbinit values are the initial values of *_dispatched, which is updated on the first call.
+;;; Therefore, *_dispatch_init is only executed on the first call.
+
+; Initialise symbols
+mbin_interface sha512_ctx_mgr_init
+mbin_interface sha512_ctx_mgr_submit
+mbin_interface sha512_ctx_mgr_flush
+
+%ifdef HAVE_AS_KNOWS_AVX512
+	; Six-level dispatch (base, SSE, AVX, AVX2, AVX512), with the
+	; single-buffer SSE4 variant selected on Avoton
+ mbin_dispatch_init6_avoton sha512_ctx_mgr_init, sha512_ctx_mgr_init_base, \
+ sha512_ctx_mgr_init_sse, sha512_ctx_mgr_init_avx, \
+ sha512_ctx_mgr_init_avx2, sha512_ctx_mgr_init_avx512, \
+ sha512_ctx_mgr_init_sb_sse4
+
+ mbin_dispatch_init6_avoton sha512_ctx_mgr_submit, sha512_ctx_mgr_submit_base, \
+ sha512_ctx_mgr_submit_sse, sha512_ctx_mgr_submit_avx, \
+ sha512_ctx_mgr_submit_avx2, sha512_ctx_mgr_submit_avx512, \
+ sha512_ctx_mgr_submit_sb_sse4
+
+ mbin_dispatch_init6_avoton sha512_ctx_mgr_flush, sha512_ctx_mgr_flush_base, \
+ sha512_ctx_mgr_flush_sse, sha512_ctx_mgr_flush_avx, \
+ sha512_ctx_mgr_flush_avx2, sha512_ctx_mgr_flush_avx512, \
+ sha512_ctx_mgr_flush_sb_sse4
+%else
+ mbin_dispatch_init_avoton sha512_ctx_mgr_init, sha512_ctx_mgr_init_sse, \
+ sha512_ctx_mgr_init_avx, sha512_ctx_mgr_init_avx2, \
+ sha512_ctx_mgr_init_sb_sse4
+
+ mbin_dispatch_init_avoton sha512_ctx_mgr_submit, sha512_ctx_mgr_submit_sse, \
+ sha512_ctx_mgr_submit_avx, sha512_ctx_mgr_submit_avx2, \
+ sha512_ctx_mgr_submit_sb_sse4
+
+ mbin_dispatch_init_avoton sha512_ctx_mgr_flush, sha512_ctx_mgr_flush_sse, \
+ sha512_ctx_mgr_flush_avx, sha512_ctx_mgr_flush_avx2, \
+ sha512_ctx_mgr_flush_sb_sse4
+%endif
+
+
+;;; func core, ver, snum
+slversion sha512_ctx_mgr_init, 00, 04, 0175
+slversion sha512_ctx_mgr_submit, 00, 04, 0176
+slversion sha512_ctx_mgr_flush, 00, 04, 0177
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c
new file mode 100644
index 000000000..e9b156a33
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c
@@ -0,0 +1,234 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sha512_mb.h"
+#include "endian_helper.h"
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference SHA512 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#define H0 0x6a09e667f3bcc908
+#define H1 0xbb67ae8584caa73b
+#define H2 0x3c6ef372fe94f82b
+#define H3 0xa54ff53a5f1d36f1
+#define H4 0x510e527fade682d1
+#define H5 0x9b05688c2b3e6c1f
+#define H6 0x1f83d9abfb41bd6b
+#define H7 0x5be0cd19137e2179
+
+void sha512_single(const uint8_t * data, uint64_t digest[]);
+
+void sha512_ref(uint8_t * input_data, uint64_t * digest, uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA512_BLOCK_SIZE];
+
+ /* 128 bit lengths not needed as len is uint32_t, so use 64 bit length
+ * and pad the first 64 bits with zeros. */
+
+ digest[0] = H0;
+ digest[1] = H1;
+ digest[2] = H2;
+ digest[3] = H3;
+ digest[4] = H4;
+ digest[5] = H5;
+ digest[6] = H6;
+ digest[7] = H7;
+
+ i = len;
+ /* Hash the complete blocks */
+ while (i >= SHA512_BLOCK_SIZE) {
+ sha512_single(input_data, digest);
+ input_data += SHA512_BLOCK_SIZE;
+ i -= SHA512_BLOCK_SIZE;
+ }
+
+ /* Copy remainder to a buffer to be padded */
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+
+ // Pad more than required here and overwrite with length
+ for (j = i; j < (2 * SHA512_BLOCK_SIZE); j++)
+ buf[j] = 0;
+
+ if (i > SHA512_BLOCK_SIZE - SHA512_PADLENGTHFIELD_SIZE)
+ i = 2 * SHA512_BLOCK_SIZE;
+ else
+ i = SHA512_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8);
+
+ /* Hash the padded last block */
+ sha512_single(buf, digest);
+	if (i == 2 * SHA512_BLOCK_SIZE)
+		sha512_single(buf + SHA512_BLOCK_SIZE, digest);
+}
+
+/* From the FIPS, these are the same as for SHA256, but operating on 64 bit words
+ * instead of 32 bit.
+ */
+#define ch(e,f,g) ((e & f) ^ (g & ~e))
+#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c))
+
+/* Sigma functions have same form as SHA256 but
+ * - change the word size to 64bit
+ * - change the amount to rotate
+ */
+#define ror64(x, r) (((x)>>(r)) ^ ((x)<<(64-(r))))
+
+/* In FIPS 180-4 notation the cases are swapped: s0/s1 below are the "capital sigma"
+ * functions and S0/S1 are the lower-case sigma functions, but the names are kept
+ * as-is to stay consistent with the other reference functions.
+ */
+#define s0(a) (ror64(a,28) ^ ror64(a,34) ^ ror64(a,39))
+#define s1(e) (ror64(e,14) ^ ror64(e,18) ^ ror64(e,41))
+
+#define S0(w) (ror64(w,1) ^ ror64(w,8) ^ (w >> 7))
+#define S1(w) (ror64(w,19) ^ ror64(w,61) ^ (w >> 6))
+
+#define W(x) w[(x) & 15]
+
+#define step(i,a,b,c,d,e,f,g,h,k) \
+ if (i<16) W(i) = to_be64(ww[i]); \
+ else \
+ W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \
+ t2 = s0(a) + maj(a,b,c); \
+ t1 = h + s1(e) + ch(e,f,g) + k + W(i); \
+ d += t1; \
+ h = t1 + t2;
+
+void sha512_single(const uint8_t * data, uint64_t digest[])
+{
+ /* Check these are all uint64_t */
+ uint64_t a, b, c, d, e, f, g, h, t1, t2;
+ uint64_t w[16];
+ uint64_t *ww = (uint64_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+ f = digest[5];
+ g = digest[6];
+ h = digest[7];
+
+ step(0, a, b, c, d, e, f, g, h, 0x428a2f98d728ae22);
+ step(1, h, a, b, c, d, e, f, g, 0x7137449123ef65cd);
+ step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcfec4d3b2f);
+ step(3, f, g, h, a, b, c, d, e, 0xe9b5dba58189dbbc);
+ step(4, e, f, g, h, a, b, c, d, 0x3956c25bf348b538);
+ step(5, d, e, f, g, h, a, b, c, 0x59f111f1b605d019);
+ step(6, c, d, e, f, g, h, a, b, 0x923f82a4af194f9b);
+ step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5da6d8118);
+ step(8, a, b, c, d, e, f, g, h, 0xd807aa98a3030242);
+ step(9, h, a, b, c, d, e, f, g, 0x12835b0145706fbe);
+ step(10, g, h, a, b, c, d, e, f, 0x243185be4ee4b28c);
+ step(11, f, g, h, a, b, c, d, e, 0x550c7dc3d5ffb4e2);
+ step(12, e, f, g, h, a, b, c, d, 0x72be5d74f27b896f);
+ step(13, d, e, f, g, h, a, b, c, 0x80deb1fe3b1696b1);
+ step(14, c, d, e, f, g, h, a, b, 0x9bdc06a725c71235);
+ step(15, b, c, d, e, f, g, h, a, 0xc19bf174cf692694);
+ step(16, a, b, c, d, e, f, g, h, 0xe49b69c19ef14ad2);
+ step(17, h, a, b, c, d, e, f, g, 0xefbe4786384f25e3);
+ step(18, g, h, a, b, c, d, e, f, 0x0fc19dc68b8cd5b5);
+ step(19, f, g, h, a, b, c, d, e, 0x240ca1cc77ac9c65);
+ step(20, e, f, g, h, a, b, c, d, 0x2de92c6f592b0275);
+ step(21, d, e, f, g, h, a, b, c, 0x4a7484aa6ea6e483);
+ step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dcbd41fbd4);
+ step(23, b, c, d, e, f, g, h, a, 0x76f988da831153b5);
+ step(24, a, b, c, d, e, f, g, h, 0x983e5152ee66dfab);
+ step(25, h, a, b, c, d, e, f, g, 0xa831c66d2db43210);
+ step(26, g, h, a, b, c, d, e, f, 0xb00327c898fb213f);
+ step(27, f, g, h, a, b, c, d, e, 0xbf597fc7beef0ee4);
+ step(28, e, f, g, h, a, b, c, d, 0xc6e00bf33da88fc2);
+ step(29, d, e, f, g, h, a, b, c, 0xd5a79147930aa725);
+ step(30, c, d, e, f, g, h, a, b, 0x06ca6351e003826f);
+ step(31, b, c, d, e, f, g, h, a, 0x142929670a0e6e70);
+ step(32, a, b, c, d, e, f, g, h, 0x27b70a8546d22ffc);
+ step(33, h, a, b, c, d, e, f, g, 0x2e1b21385c26c926);
+ step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc5ac42aed);
+ step(35, f, g, h, a, b, c, d, e, 0x53380d139d95b3df);
+ step(36, e, f, g, h, a, b, c, d, 0x650a73548baf63de);
+ step(37, d, e, f, g, h, a, b, c, 0x766a0abb3c77b2a8);
+ step(38, c, d, e, f, g, h, a, b, 0x81c2c92e47edaee6);
+ step(39, b, c, d, e, f, g, h, a, 0x92722c851482353b);
+ step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a14cf10364);
+ step(41, h, a, b, c, d, e, f, g, 0xa81a664bbc423001);
+ step(42, g, h, a, b, c, d, e, f, 0xc24b8b70d0f89791);
+ step(43, f, g, h, a, b, c, d, e, 0xc76c51a30654be30);
+ step(44, e, f, g, h, a, b, c, d, 0xd192e819d6ef5218);
+ step(45, d, e, f, g, h, a, b, c, 0xd69906245565a910);
+ step(46, c, d, e, f, g, h, a, b, 0xf40e35855771202a);
+ step(47, b, c, d, e, f, g, h, a, 0x106aa07032bbd1b8);
+ step(48, a, b, c, d, e, f, g, h, 0x19a4c116b8d2d0c8);
+ step(49, h, a, b, c, d, e, f, g, 0x1e376c085141ab53);
+ step(50, g, h, a, b, c, d, e, f, 0x2748774cdf8eeb99);
+ step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5e19b48a8);
+ step(52, e, f, g, h, a, b, c, d, 0x391c0cb3c5c95a63);
+ step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4ae3418acb);
+ step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f7763e373);
+ step(55, b, c, d, e, f, g, h, a, 0x682e6ff3d6b2b8a3);
+ step(56, a, b, c, d, e, f, g, h, 0x748f82ee5defb2fc);
+ step(57, h, a, b, c, d, e, f, g, 0x78a5636f43172f60);
+ step(58, g, h, a, b, c, d, e, f, 0x84c87814a1f0ab72);
+ step(59, f, g, h, a, b, c, d, e, 0x8cc702081a6439ec);
+ step(60, e, f, g, h, a, b, c, d, 0x90befffa23631e28);
+ step(61, d, e, f, g, h, a, b, c, 0xa4506cebde82bde9);
+ step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7b2c67915);
+ step(63, b, c, d, e, f, g, h, a, 0xc67178f2e372532b); // step 63
+ step(64, a, b, c, d, e, f, g, h, 0xca273eceea26619c);
+ step(65, h, a, b, c, d, e, f, g, 0xd186b8c721c0c207);
+ step(66, g, h, a, b, c, d, e, f, 0xeada7dd6cde0eb1e);
+ step(67, f, g, h, a, b, c, d, e, 0xf57d4f7fee6ed178);
+ step(68, e, f, g, h, a, b, c, d, 0x06f067aa72176fba);
+ step(69, d, e, f, g, h, a, b, c, 0x0a637dc5a2c898a6);
+ step(70, c, d, e, f, g, h, a, b, 0x113f9804bef90dae);
+ step(71, b, c, d, e, f, g, h, a, 0x1b710b35131c471b);
+ step(72, a, b, c, d, e, f, g, h, 0x28db77f523047d84);
+ step(73, h, a, b, c, d, e, f, g, 0x32caab7b40c72493);
+ step(74, g, h, a, b, c, d, e, f, 0x3c9ebe0a15c9bebc);
+ step(75, f, g, h, a, b, c, d, e, 0x431d67c49c100d4c);
+ step(76, e, f, g, h, a, b, c, d, 0x4cc5d4becb3e42b6);
+ step(77, d, e, f, g, h, a, b, c, 0x597f299cfc657e2a);
+ step(78, c, d, e, f, g, h, a, b, 0x5fcb6fab3ad6faec);
+ step(79, b, c, d, e, f, g, h, a, 0x6c44198c4a475817); // step 79
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+ digest[5] += f;
+ digest[6] += g;
+ digest[7] += h;
+}
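
As a quick sanity check of the reference implementation above, sha512_ref() can be driven from
a small standalone program. The sketch below is illustrative only and not part of the patch; the
prototype is taken from sha512_ref.c itself, and the expected output for "abc" is the standard
FIPS 180 test vector, whose first digest word is 0xddaf35a193617aba.

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative sketch only. Prototype of the reference routine in sha512_ref.c. */
    void sha512_ref(uint8_t *input_data, uint64_t *digest, uint32_t len);

    int main(void)
    {
            uint8_t msg[] = "abc";          /* FIPS 180 test vector input           */
            uint64_t digest[8];
            int i;

            sha512_ref(msg, digest, 3);     /* length in bytes; padding is internal */

            for (i = 0; i < 8; i++)         /* first word should be ddaf35a193617aba */
                    printf("%016llx", (unsigned long long)digest[i]);
            printf("\n");
            return 0;
    }
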
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c
new file mode 100644
index 000000000..6650b0106
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c
@@ -0,0 +1,46 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include "sha512_mb.h"
+
+/*
+ * Function: sha512_sb_mgr_flush_sse4.
+ *
+ * Description: This is a dummy implementation; nothing is done here.
+ *
+ * Return:      always NULL.
+ *
+ */
+SHA512_JOB *sha512_sb_mgr_flush_sse4(SHA512_MB_JOB_MGR * state)
+{
+ return NULL;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c
new file mode 100644
index 000000000..69df5600d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c
@@ -0,0 +1,38 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+
+// For the single-buffer implementation there is nothing to initialize.
+// This function exists only so that the single-buffer code conforms to
+// the multi-buffer API.
+void sha512_sb_mgr_init_sse4(SHA512_MB_JOB_MGR * state)
+{
+ return;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c
new file mode 100644
index 000000000..96e1a5ee4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c
@@ -0,0 +1,65 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include "sha512_mb.h"
+
+/*
+ * Function: sha512_sb_mgr_submit_sse4
+ *
+ * Description: Wrapper around the single-buffer SHA-512 update routine so
+ *              that it complies with the multi-buffer API.
+ *
+ *              It picks up the message, digest and length information from
+ *              the "job" argument and then calls sha512_sse4(). The "state"
+ *              argument is accepted but not used.
+ *
+ * Note:        Message initialization and padding are done by the caller;
+ *              this function expects a packed buffer.
+ *
+ * Argument:    state - unused.
+ *              job   - contains the message, digest, message length, etc.
+ *
+ * Return: SHA512_JOB pointer.
+ *
+ **/
+SHA512_JOB *sha512_sb_mgr_submit_sse4(SHA512_MB_JOB_MGR * state, SHA512_JOB * job)
+{
+ assert(job != NULL);
+
+ uint8_t *buff = job->buffer;
+ uint64_t *digest = job->result_digest, len = job->len;
+
+ sha512_sse4((const void *)buff, (void *)digest, len);
+
+ return job;
+}
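
To make the calling convention of these single-buffer wrappers concrete, a minimal sketch follows.
It assumes job->result_digest has already been loaded with the initial SHA-512 H0..H7 words and
that job->len counts whole 128-byte blocks of an already padded buffer, as the comment above
requires; the helper name is made up for illustration.

    #include <string.h>
    #include "sha512_mb.h"

    /* Illustrative sketch only: hash one pre-padded block (the padded empty
     * message) through the single-buffer SSE4 wrappers. */
    void sb_sse4_example(SHA512_JOB *job)
    {
            static uint8_t block[SHA512_BLOCK_SIZE];
            SHA512_MB_JOB_MGR mgr;

            sha512_sb_mgr_init_sse4(&mgr);        /* no-op for single buffer      */

            memset(block, 0, sizeof(block));
            block[0] = 0x80;                      /* padding of the empty message */

            job->buffer = block;
            job->len = 1;                         /* length in SHA512 blocks      */
            /* job->result_digest is assumed to already hold H0..H7              */

            sha512_sb_mgr_submit_sse4(&mgr, job); /* runs sha512_sse4()           */
            sha512_sb_mgr_flush_sse4(&mgr);       /* dummy, returns NULL          */
    }
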
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm
new file mode 100644
index 000000000..8b43bce5e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm
@@ -0,0 +1,396 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+; Virtual Registers
+%ifidn __OUTPUT_FORMAT__, win64
+ %define msg rcx ; ARG1
+ %define digest rdx ; ARG2
+ %define msglen r8 ; ARG3
+ %define T1 rsi
+ %define T2 rdi
+%else
+ %define msg rdi ; ARG1
+ %define digest rsi ; ARG2
+ %define msglen rdx ; ARG3
+ %define T1 rcx
+ %define T2 r8
+%endif
+%define a_64 r9
+%define b_64 r10
+%define c_64 r11
+%define d_64 r12
+%define e_64 r13
+%define f_64 r14
+%define g_64 r15
+%define h_64 rbx
+%define tmp0 rax
+
+; Local variables (stack frame)
+; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP
+struc frame
+ .W: resq 80 ; Message Schedule
+ .WK: resq 2 ; W[t] + K[t] | W[t+1] + K[t+1]
+
+%ifidn __OUTPUT_FORMAT__, win64
+ .GPRSAVE: resq 7
+%else
+ .GPRSAVE: resq 5
+%endif
+endstruc
+
+; Useful QWORD "arrays" for simpler memory references
+%define MSG(i) msg + 8*(i) ; Input message (arg1)
+%define DIGEST(i) digest + 8*(i) ; Output Digest (arg2)
+%define K_t(i) K512 + 8*(i) ; SHA Constants (static mem)
+%define W_t(i) rsp + frame.W + 8*(i) ; Message Schedule (stack frame)
+%define WK_2(i) rsp + frame.WK + 8*((i) % 2) ; W[t]+K[t] (stack frame)
+; MSG, DIGEST, K_t, W_t are arrays
+; WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
+
+%macro RotateState 0
+	; Rotate symbols a..h right
+ %xdefine %%TMP h_64
+ %xdefine h_64 g_64
+ %xdefine g_64 f_64
+ %xdefine f_64 e_64
+ %xdefine e_64 d_64
+ %xdefine d_64 c_64
+ %xdefine c_64 b_64
+ %xdefine b_64 a_64
+ %xdefine a_64 %%TMP
+%endmacro
+
+%macro SHA512_Round 1
+%assign %%t (%1)
+
+ ; Compute Round %%t
+ mov T1, f_64 ; T1 = f
+ mov tmp0, e_64 ; tmp = e
+ xor T1, g_64 ; T1 = f ^ g
+ ror tmp0, 23 ; 41 ; tmp = e ror 23
+ and T1, e_64 ; T1 = (f ^ g) & e
+ xor tmp0, e_64 ; tmp = (e ror 23) ^ e
+ xor T1, g_64 ; T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+ add T1, [WK_2(%%t)] ; W[t] + K[t] from message scheduler
+ ror tmp0, 4 ; 18 ; tmp = ((e ror 23) ^ e) ror 4
+ xor tmp0, e_64 ; tmp = (((e ror 23) ^ e) ror 4) ^ e
+ mov T2, a_64 ; T2 = a
+ add T1, h_64 ; T1 = CH(e,f,g) + W[t] + K[t] + h
+ ror tmp0, 14 ; 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+ add T1, tmp0 ; T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+ mov tmp0, a_64 ; tmp = a
+ xor T2, c_64 ; T2 = a ^ c
+ and tmp0, c_64 ; tmp = a & c
+ and T2, b_64 ; T2 = (a ^ c) & b
+ xor T2, tmp0 ; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+ mov tmp0, a_64 ; tmp = a
+ ror tmp0, 5 ; 39 ; tmp = a ror 5
+ xor tmp0, a_64 ; tmp = (a ror 5) ^ a
+ add d_64, T1 ; e(next_state) = d + T1
+ ror tmp0, 6 ; 34 ; tmp = ((a ror 5) ^ a) ror 6
+ xor tmp0, a_64 ; tmp = (((a ror 5) ^ a) ror 6) ^ a
+ lea h_64, [T1 + T2] ; a(next_state) = T1 + Maj(a,b,c)
+ ror tmp0, 28 ; 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+	add	h_64, tmp0	; a(next_state) = T1 + Maj(a,b,c) + S0(a)
+ RotateState
+%endmacro
+
+%macro SHA512_2Sched_2Round_sse 1
+%assign %%t (%1)
+
+ ; Compute rounds %%t-2 and %%t-1
+ ; Compute message schedule QWORDS %%t and %%t+1
+
+ ; Two rounds are computed based on the values for K[t-2]+W[t-2] and
+ ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+ ; scheduler.
+ ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
+ ; They are then added to their respective SHA512 constants at
+ ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
+	; For brevity, the comments following vectored instructions only refer to
+	; the first of a pair of QWORDS.
+	; E.g. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
+ ; The computation of the message schedule and the rounds are tightly
+ ; stitched to take advantage of instruction-level parallelism.
+ ; For clarity, integer instructions (for the rounds calculation) are indented
+ ; by one tab. Vectored instructions (for the message scheduler) are indented
+ ; by two tabs.
+
+ mov T1, f_64
+ movdqa xmm2, [W_t(%%t-2)] ; XMM2 = W[t-2]
+ xor T1, g_64
+ and T1, e_64
+ movdqa xmm0, xmm2 ; XMM0 = W[t-2]
+ xor T1, g_64
+ add T1, [WK_2(%%t)]
+ movdqu xmm5, [W_t(%%t-15)] ; XMM5 = W[t-15]
+ mov tmp0, e_64
+ ror tmp0, 23 ; 41
+ movdqa xmm3, xmm5 ; XMM3 = W[t-15]
+ xor tmp0, e_64
+ ror tmp0, 4 ; 18
+ psrlq xmm0, 61 - 19 ; XMM0 = W[t-2] >> 42
+ xor tmp0, e_64
+ ror tmp0, 14 ; 14
+ psrlq xmm3, (8 - 7) ; XMM3 = W[t-15] >> 1
+ add T1, tmp0
+ add T1, h_64
+ pxor xmm0, xmm2 ; XMM0 = (W[t-2] >> 42) ^ W[t-2]
+ mov T2, a_64
+ xor T2, c_64
+ pxor xmm3, xmm5 ; XMM3 = (W[t-15] >> 1) ^ W[t-15]
+ and T2, b_64
+ mov tmp0, a_64
+ psrlq xmm0, 19 - 6 ; XMM0 = ((W[t-2]>>42)^W[t-2])>>13
+ and tmp0, c_64
+ xor T2, tmp0
+ psrlq xmm3, (7 - 1) ; XMM3 = ((W[t-15]>>1)^W[t-15])>>6
+ mov tmp0, a_64
+ ror tmp0, 5 ; 39
+ pxor xmm0, xmm2 ; XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
+ xor tmp0, a_64
+ ror tmp0, 6 ; 34
+ pxor xmm3, xmm5 ; XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
+ xor tmp0, a_64
+ ror tmp0, 28 ; 28
+ psrlq xmm0, 6 ; XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
+ add T2, tmp0
+ add d_64, T1
+ psrlq xmm3, 1 ; XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1
+ lea h_64, [T1 + T2]
+ RotateState
+ movdqa xmm1, xmm2 ; XMM1 = W[t-2]
+ mov T1, f_64
+ xor T1, g_64
+ movdqa xmm4, xmm5 ; XMM4 = W[t-15]
+ and T1, e_64
+ xor T1, g_64
+ psllq xmm1, (64 - 19) - (64 - 61) ; XMM1 = W[t-2] << 42
+ add T1, [WK_2(%%t+1)]
+ mov tmp0, e_64
+ psllq xmm4, (64 - 1) - (64 - 8) ; XMM4 = W[t-15] << 7
+ ror tmp0, 23 ; 41
+ xor tmp0, e_64
+ pxor xmm1, xmm2 ; XMM1 = (W[t-2] << 42)^W[t-2]
+ ror tmp0, 4 ; 18
+ xor tmp0, e_64
+ pxor xmm4, xmm5 ; XMM4 = (W[t-15]<<7)^W[t-15]
+ ror tmp0, 14 ; 14
+ add T1, tmp0
+ psllq xmm1, (64 - 61) ; XMM1 = ((W[t-2] << 42)^W[t-2])<<3
+ add T1, h_64
+ mov T2, a_64
+ psllq xmm4, (64 - 8) ; XMM4 = ((W[t-15]<<7)^W[t-15])<<56
+ xor T2, c_64
+ and T2, b_64
+ pxor xmm0, xmm1 ; XMM0 = s1(W[t-2])
+ mov tmp0, a_64
+ and tmp0, c_64
+ movdqu xmm1, [W_t(%%t- 7)] ; XMM1 = W[t-7]
+ xor T2, tmp0
+ pxor xmm3, xmm4 ; XMM3 = s0(W[t-15])
+ mov tmp0, a_64
+ paddq xmm0, xmm3 ; XMM0 = s1(W[t-2]) + s0(W[t-15])
+ ror tmp0, 5 ; 39
+ paddq xmm0, [W_t(%%t-16)] ; XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
+ xor tmp0, a_64
+ paddq xmm0, xmm1 ; XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
+ ror tmp0, 6 ; 34
+ movdqa [W_t(%%t)], xmm0 ; Store scheduled qwords
+ xor tmp0, a_64
+ paddq xmm0, [K_t(t)] ; Compute W[t]+K[t]
+ ror tmp0, 28 ; 28
+ movdqa [WK_2(t)], xmm0 ; Store W[t]+K[t] for next rounds
+ add T2, tmp0
+ add d_64, T1
+ lea h_64, [T1 + T2]
+ RotateState
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; void sha512_sse4(const void* M, void* D, uint64_t L);
+; Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+; The size of the message pointed to by M must be an integer number of SHA512
+; message blocks (a whole multiple of 128 bytes).
+; L is the message length in SHA512 blocks.
+mk_global sha512_sse4, function
+sha512_sse4:
+ endbranch
+ cmp msglen, 0
+ je .nowork
+
+ ; Allocate Stack Space
+ sub rsp, frame_size
+
+ ; Save GPRs
+ mov [rsp + frame.GPRSAVE + 8 * 0], rbx
+ mov [rsp + frame.GPRSAVE + 8 * 1], r12
+ mov [rsp + frame.GPRSAVE + 8 * 2], r13
+ mov [rsp + frame.GPRSAVE + 8 * 3], r14
+ mov [rsp + frame.GPRSAVE + 8 * 4], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + frame.GPRSAVE + 8 * 5], rsi
+ mov [rsp + frame.GPRSAVE + 8 * 6], rdi
+%endif
+
+.updateblock:
+
+ ; Load state variables
+ mov a_64, [DIGEST(0)]
+ mov b_64, [DIGEST(1)]
+ mov c_64, [DIGEST(2)]
+ mov d_64, [DIGEST(3)]
+ mov e_64, [DIGEST(4)]
+ mov f_64, [DIGEST(5)]
+ mov g_64, [DIGEST(6)]
+ mov h_64, [DIGEST(7)]
+
+ %assign t 0
+ %rep 80/2 + 1
+ ; (80 rounds) / (2 rounds/iteration) + (1 iteration)
+ ; +1 iteration because the scheduler leads hashing by 1 iteration
+ %if t < 2
+ ; BSWAP 2 QWORDS
+ movdqa xmm1, [XMM_QWORD_BSWAP]
+ movdqu xmm0, [MSG(t)]
+ pshufb xmm0, xmm1 ; BSWAP
+ movdqa [W_t(t)], xmm0 ; Store Scheduled Pair
+ paddq xmm0, [K_t(t)] ; Compute W[t]+K[t]
+ movdqa [WK_2(t)], xmm0 ; Store into WK for rounds
+ %elif t < 16
+ ; BSWAP 2 QWORDS; Compute 2 Rounds
+ movdqu xmm0, [MSG(t)]
+ pshufb xmm0, xmm1 ; BSWAP
+ SHA512_Round t - 2 ; Round t-2
+ movdqa [W_t(t)], xmm0 ; Store Scheduled Pair
+ paddq xmm0, [K_t(t)] ; Compute W[t]+K[t]
+ SHA512_Round t - 1 ; Round t-1
+ movdqa [WK_2(t)], xmm0 ; Store W[t]+K[t] into WK
+ %elif t < 79
+ ; Schedule 2 QWORDS; Compute 2 Rounds
+ SHA512_2Sched_2Round_sse t
+ %else
+ ; Compute 2 Rounds
+ SHA512_Round t - 2
+ SHA512_Round t - 1
+ %endif
+ %assign t t+2
+ %endrep
+
+ ; Update digest
+ add [DIGEST(0)], a_64
+ add [DIGEST(1)], b_64
+ add [DIGEST(2)], c_64
+ add [DIGEST(3)], d_64
+ add [DIGEST(4)], e_64
+ add [DIGEST(5)], f_64
+ add [DIGEST(6)], g_64
+ add [DIGEST(7)], h_64
+
+ ; Advance to next message block
+ add msg, 16*8
+ dec msglen
+ jnz .updateblock
+
+ ; Restore GPRs
+ mov rbx, [rsp + frame.GPRSAVE + 8 * 0]
+ mov r12, [rsp + frame.GPRSAVE + 8 * 1]
+ mov r13, [rsp + frame.GPRSAVE + 8 * 2]
+ mov r14, [rsp + frame.GPRSAVE + 8 * 3]
+ mov r15, [rsp + frame.GPRSAVE + 8 * 4]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rsi, [rsp + frame.GPRSAVE + 8 * 5]
+ mov rdi, [rsp + frame.GPRSAVE + 8 * 6]
+%endif
+ ; Restore Stack Pointer
+ add rsp, frame_size
+
+.nowork:
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Binary Data
+
+section .data
+
+ALIGN 16
+
+; Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+
+; K[t] used in SHA512 hashing
+K512:
+ dq 0x428a2f98d728ae22,0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538,0x59f111f1b605d019
+ dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242,0x12835b0145706fbe
+ dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235,0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ dq 0x983e5152ee66dfab,0xa831c66d2db43210
+ dq 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ dq 0x06ca6351e003826f,0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ dq 0x650a73548baf63de,0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6,0x92722c851482353b
+ dq 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791,0xc76c51a30654be30
+ dq 0xd192e819d6ef5218,0xd69906245565a910
+ dq 0xf40e35855771202a,0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc,0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ dq 0x90befffa23631e28,0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ dq 0xca273eceea26619c,0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae,0x1b710b35131c471b
+ dq 0x28db77f523047d84,0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
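The "; 41", "; 18" and "; 14" annotations next to the ror instructions in SHA512_Round above record
which rotation of the original value each step corresponds to: S1(e) = ror14 ^ ror18 ^ ror41 is built
from three dependent rotates because 23+4+14 = 41 and 4+14 = 18 (S0(a) uses 5+6+28 = 39 and
6+28 = 34 the same way). A small, purely illustrative C check of that identity:

    #include <assert.h>
    #include <stdint.h>

    #define ROR64(x, r) (((x) >> (r)) | ((x) << (64 - (r))))

    /* Illustrative sketch only: S1(e) written directly and as the chained
     * form used by the assembly above. */
    static uint64_t s1_direct(uint64_t e)
    {
            return ROR64(e, 14) ^ ROR64(e, 18) ^ ROR64(e, 41);
    }

    static uint64_t s1_chained(uint64_t e)
    {
            return ROR64(ROR64(ROR64(e, 23) ^ e, 4) ^ e, 14);
    }

    int main(void)
    {
            assert(s1_direct(0x0123456789abcdefULL) == s1_chained(0x0123456789abcdefULL));
            return 0;
    }
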
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sm3_mb/Makefile.am
new file mode 100644
index 000000000..8f8a3f4a6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/Makefile.am
@@ -0,0 +1,121 @@
+########################################################################
+# Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_x86_64 += sm3_mb/sm3_ctx_base.c \
+ sm3_mb/sm3_multibinary.asm
+
+lsrc_base_aliases += sm3_mb/sm3_ctx_base.c \
+ sm3_mb/sm3_ctx_base_aliases.c
+
+lsrc_aarch64 += sm3_mb/sm3_ctx_base.c \
+ sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c \
+ sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S \
+ sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c \
+ sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c \
+ sm3_mb/aarch64/sm3_mb_sm_x1.S \
+ sm3_mb/aarch64/sm3_mb_sm_x2.S \
+ sm3_mb/aarch64/sm3_mb_sm_x3.S \
+ sm3_mb/aarch64/sm3_mb_sm_x4.S \
+ sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c \
+ sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c \
+ sm3_mb/aarch64/sm3_mb_asimd_x1.S \
+ sm3_mb/aarch64/sm3_mb_asimd_x4.S
+
+
+src_include += -I $(srcdir)/sm3_mb
+
+extern_hdrs += include/sm3_mb.h \
+ include/multi_buffer.h
+
+lsrc_x86_64 += sm3_mb/sm3_ctx_avx512.c \
+ sm3_mb/sm3_mb_mgr_submit_avx512.asm \
+ sm3_mb/sm3_mb_mgr_flush_avx512.asm \
+ sm3_mb/sm3_mb_x16_avx512.asm
+
+lsrc_x86_64 += sm3_mb/sm3_ctx_avx2.c \
+ sm3_mb/sm3_mb_mgr_submit_avx2.asm \
+ sm3_mb/sm3_mb_mgr_flush_avx2.asm \
+ sm3_mb/sm3_mb_x8_avx2.asm
+
+other_src += include/datastruct.asm \
+ include/multibinary.asm \
+ include/reg_sizes.asm \
+ include/memcpy_inline.h \
+ include/memcpy.asm \
+ include/intrinreg.h \
+ sm3_mb/sm3_job.asm \
+ sm3_mb/sm3_mb_mgr_datastruct.asm \
+ sm3_mb/sm3_test_helper.c
+
+check_tests += sm3_mb/sm3_ref_test
+
+unit_tests += sm3_mb/sm3_mb_rand_ssl_test \
+ sm3_mb/sm3_mb_rand_test \
+ sm3_mb/sm3_mb_rand_update_test \
+ sm3_mb/sm3_mb_flush_test \
+ sm3_mb/sm3_mb_test
+
+perf_tests += sm3_mb/sm3_mb_vs_ossl_perf \
+ sm3_mb/sm3_mb_vs_ossl_shortage_perf
+
+sm3_mb_rand_ssl_test: LDLIBS += -lcrypto
+sm3_mb_sm3_mb_rand_ssl_test_LDFLAGS = -lcrypto
+
+sm3_mb_rand_ssl_test: sm3_test_helper.o
+sm3_mb_sm3_mb_rand_ssl_test_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la
+
+sm3_mb_rand_update_test: LDLIBS += -lcrypto
+sm3_mb_sm3_mb_rand_update_test_LDFLAGS = -lcrypto
+
+sm3_mb_rand_update_test: sm3_test_helper.o
+sm3_mb_sm3_mb_rand_update_test_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la
+
+sm3_mb_flush_test: LDLIBS += -lcrypto
+sm3_mb_sm3_mb_flush_test_LDFLAGS = -lcrypto
+
+sm3_mb_flush_test: sm3_test_helper.o
+sm3_mb_sm3_mb_flush_test_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la
+
+sm3_mb_rand_test: LDLIBS += -lcrypto
+sm3_mb_sm3_mb_rand_test_LDFLAGS = -lcrypto
+
+sm3_mb_rand_test: sm3_test_helper.o
+sm3_mb_sm3_mb_rand_test_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la
+
+sm3_mb_vs_ossl_perf: LDLIBS += -lcrypto
+sm3_mb_sm3_mb_vs_ossl_perf_LDFLAGS = -lcrypto
+
+sm3_mb_vs_ossl_perf: sm3_test_helper.o
+sm3_mb_sm3_mb_vs_ossl_perf_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la
+
+sm3_mb_vs_ossl_shortage_perf: LDLIBS += -lcrypto
+sm3_mb_sm3_mb_vs_ossl_shortage_perf_LDFLAGS = -lcrypto
+
+sm3_mb_vs_ossl_shortage_perf: sm3_test_helper.o
+sm3_mb_sm3_mb_vs_ossl_shortage_perf_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c
new file mode 100644
index 000000000..208a7414e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c
@@ -0,0 +1,65 @@
+/**********************************************************************
+ Copyright(c) 2019-2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+DEFINE_INTERFACE_DISPATCHER(sm3_ctx_mgr_submit)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SM3)
+ return PROVIDER_INFO(sm3_ctx_mgr_submit_sm);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(sm3_ctx_mgr_submit_asimd);
+
+ return PROVIDER_BASIC(sm3_ctx_mgr_submit);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(sm3_ctx_mgr_init)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SM3)
+ return PROVIDER_INFO(sm3_ctx_mgr_init_sm);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(sm3_ctx_mgr_init_asimd);
+
+ return PROVIDER_BASIC(sm3_ctx_mgr_init);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(sm3_ctx_mgr_flush)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SM3)
+ return PROVIDER_INFO(sm3_ctx_mgr_flush_sm);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(sm3_ctx_mgr_flush_asimd);
+
+ return PROVIDER_BASIC(sm3_ctx_mgr_flush);
+
+}
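
The dispatchers above all apply the same rule: prefer the SM3 instruction set extension, fall back
to plain ASIMD, and otherwise use the portable base code. The aarch64-Linux sketch below restates
that selection logic standalone; the helper name and the returned strings are made up for
illustration, while getauxval()/AT_HWCAP and the HWCAP_SM3/HWCAP_ASIMD bits are taken from the
code above.

    #include <stdio.h>
    #include <sys/auxv.h>    /* getauxval, AT_HWCAP            */
    #include <asm/hwcap.h>   /* HWCAP_SM3, HWCAP_ASIMD (Linux) */

    /* Illustrative sketch only: pick the best SM3 multi-buffer provider at run time. */
    static const char *sm3_mb_pick_provider(void)
    {
            unsigned long hwcap = getauxval(AT_HWCAP);

            if (hwcap & HWCAP_SM3)
                    return "sm3_ctx_mgr_*_sm";     /* SM3 crypto extension */
            if (hwcap & HWCAP_ASIMD)
                    return "sm3_ctx_mgr_*_asimd";  /* plain NEON           */
            return "sm3_ctx_mgr_*_base";           /* portable C           */
    }

    int main(void)
    {
            printf("provider: %s\n", sm3_mb_pick_provider());
            return 0;
    }
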
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x1.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x1.S
new file mode 100644
index 000000000..c7362de90
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x1.S
@@ -0,0 +1,387 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a
+ .text
+ .align 2
+ .p2align 3,,7
+
+.macro declare_var_vector_reg name:req,reg:req
+ q\name\() .req q\reg
+ v\name\() .req v\reg
+ s\name\() .req s\reg
+.endm
+
+ job .req x0
+ len .req x1
+ data .req x2
+ digest .req x0
+
+ msg0 .req w3
+ msg1 .req w4
+ msg2 .req w5
+ msg3 .req w6
+ msg4 .req w7
+
+ msg .req w9
+ msgP .req w10
+ SS1 .req w11
+ SS2 .req w12
+ TT1 .req w13
+ TT2 .req w14
+ Tj .req w15
+ tmp0 .req w19
+ tmp1 .req w20
+ dig_A .req w21
+ dig_B .req w22
+ dig_C .req w23
+ dig_D .req w24
+ dig_E .req w25
+ dig_F .req w26
+ dig_G .req w27
+ dig_H .req w28
+
+ declare_var_vector_reg dig0,0
+ declare_var_vector_reg dig1,1
+ declare_var_vector_reg dig0_bak,2
+ declare_var_vector_reg dig1_bak,3
+ declare_var_vector_reg vect_msg0,4
+ declare_var_vector_reg vect_msg1,5
+ declare_var_vector_reg vect_msg2,6
+ declare_var_vector_reg vect_msg3,7
+
+ declare_var_vector_reg vect_msgP0,16
+ declare_var_vector_reg vect_msgP1,17
+ declare_var_vector_reg vect_msgP2,18
+
+
+
+
+
+
+// round 0-11
+.macro sm3_round_0 round:req
+ ldr msg, [sp,msg_off+4*\round\()]
+ ldr msgP,[sp,wp_off +4*\round\()]
+ add SS1,dig_E,Tj
+ ror TT1,dig_A,32-12
+ add SS1,SS1,TT1
+ ror SS1,SS1,32-7 //SS1 done
+ eor SS2,SS1,TT1 //SS2 done
+ eor TT1,dig_A,dig_B
+ eor TT2,dig_E,dig_F
+ add SS2,SS2,msgP
+ eor TT2,TT2,dig_G
+ add SS1,SS1,msg
+ eor TT1,TT1,dig_C
+ add SS2,SS2,dig_D
+ add SS1,SS1,dig_H
+ add TT1,TT1,SS2
+ add TT2,TT2,SS1
+ mov dig_D,dig_C
+ ror dig_C,dig_B,32-9
+ mov dig_B,dig_A
+ mov dig_A,TT1
+ eor TT1,TT2,TT2,ror (32-17)
+ mov dig_H,dig_G
+ ror dig_G,dig_F,32-19
+ mov dig_F,dig_E
+ eor dig_E,TT1,TT2,ror(32-9)
+ ror Tj,Tj,(32-1)
+.endm
+
+//round 12-15
+.macro sm3_round_12 round:req
+ ldr msg, [sp,msg_off+4*((\round\())%17)]
+ ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)]
+ ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)]
+ add SS1,dig_E,Tj
+ ror TT1,dig_A,32-12
+ add SS1,SS1,TT1
+ ror SS1,SS1,32-7 //SS1 done
+ eor SS2,SS1,TT1 //SS2 done
+
+ eor msg0,msg0,msg1
+ ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)]
+ eor TT1,dig_A,dig_B
+ eor TT2,dig_E,dig_F
+ add SS2,SS2,dig_D
+ eor TT2,TT2,dig_G
+ add SS1,SS1,msg
+ eor msg0,msg0,msg2,ror (32-15)
+ ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)]
+ ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)]
+ eor msg1,msg0,msg0,ror (32 -15)
+ eor TT1,TT1,dig_C
+ add TT1,TT1,SS2
+ eor msg4,msg4,msg3, ror (32-7)
+ eor msg0,msg1,msg0, ror (32-23)
+ add SS1,SS1,dig_H
+ eor msg0,msg0,msg4
+ add TT2,TT2,SS1
+ mov dig_D,dig_C
+ str msg0,[sp,msg_off+4*((\round\()+4)%17)]
+ eor msgP,msg,msg0
+ add TT1,TT1,msgP
+ ror dig_C,dig_B,32-9
+ mov dig_B,dig_A
+ mov dig_A,TT1
+ eor TT1,TT2,TT2,ror (32-17)
+ mov dig_H,dig_G
+ ror dig_G,dig_F,32-19
+ mov dig_F,dig_E
+ eor dig_E,TT1,TT2,ror(32-9)
+ ror Tj,Tj,32-1
+.endm
+
+// round 16-62
+.macro sm3_round_16 round:req
+ ldr msg, [sp,msg_off+4*((\round\())%17)]
+ ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)]
+ ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)]
+ add SS1,dig_E,Tj
+ ror TT1,dig_A,32-12
+ add SS1,SS1,TT1
+ ror SS1,SS1,32-7 //SS1 done
+ eor SS2,SS1,TT1 //SS2 done
+
+ eor msg0,msg0,msg1
+ ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)]
+ orr TT1,dig_B,dig_C
+ and tmp0,dig_B,dig_C
+
+ eor TT2,dig_F,dig_G
+ and TT1,TT1,dig_A
+ add SS2,SS2,dig_D
+ orr TT1,TT1,tmp0
+ and TT2,TT2,dig_E
+ add SS1,SS1,msg
+ eor TT2,TT2,dig_G
+
+ eor msg0,msg0,msg2,ror (32-15)
+ ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)]
+ ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)]
+ eor msg1,msg0,msg0,ror (32 -15)
+ add TT1,TT1,SS2
+ eor msg4,msg4,msg3, ror (32-7)
+ eor msg0,msg1,msg0, ror (32-23)
+ add SS1,SS1,dig_H
+ eor msg0,msg0,msg4
+ add TT2,TT2,SS1
+ mov dig_D,dig_C
+ str msg0,[sp,msg_off+4*((\round\()+4)%17)]
+ eor msgP,msg,msg0
+ add TT1,TT1,msgP
+ ror dig_C,dig_B,32-9
+ mov dig_B,dig_A
+ mov dig_A,TT1
+ eor TT1,TT2,TT2,ror (32-17)
+ mov dig_H,dig_G
+ ror dig_G,dig_F,32-19
+ mov dig_F,dig_E
+ eor dig_E,TT1,TT2,ror(32-9)
+ ror Tj,Tj,32-1
+.endm
+
+//round 63
+.macro sm3_round_63 round:req
+ ldr msg, [sp,msg_off+4*((\round\())%17)]
+ ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)]
+ ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)]
+ add SS1,dig_E,Tj
+ ror TT1,dig_A,32-12
+ add SS1,SS1,TT1
+ ror SS1,SS1,32-7 //SS1 done
+ eor SS2,SS1,TT1 //SS2 done
+ eor msg0,msg0,msg1
+ ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)]
+ orr TT1,dig_B,dig_C
+ and tmp0,dig_B,dig_C
+ eor TT2,dig_F,dig_G
+ and TT1,TT1,dig_A
+ add SS2,SS2,dig_D
+ orr TT1,TT1,tmp0
+ and TT2,TT2,dig_E
+ add SS1,SS1,msg
+ eor TT2,TT2,dig_G
+ eor msg0,msg0,msg2,ror (32-15)
+ ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)]
+ ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)]
+ eor msg1,msg0,msg0,ror (32 -15)
+ add TT1,TT1,SS2
+ eor msg4,msg4,msg3, ror (32-7)
+ eor msg0,msg1,msg0, ror (32-23)
+ add SS1,SS1,dig_H
+ eor msg0,msg0,msg4
+ add TT2,TT2,SS1
+ str msg0,[sp,msg_off+4*((\round\()+4)%17)]
+ eor msgP,msg,msg0
+ add TT1,TT1,msgP
+ ins vdig0_bak.s[3],dig_C
+ ror dig_C,dig_B,32-9
+ ins vdig0_bak.s[1],dig_A
+ ins vdig0_bak.s[0],TT1
+ ins vdig0_bak.s[2],dig_C
+ eor TT1,TT2,TT2,ror (32-17)
+ ins vdig1_bak.s[3],dig_G
+ ror dig_G,dig_F,32-19
+ ins vdig1_bak.s[1],dig_E
+ ins vdig1_bak.s[2],dig_G
+ eor dig_E,TT1,TT2,ror(32-9)
+ ins vdig1_bak.s[0],dig_E
+.endm
+
+ .set wp_off , 96
+ .set msg_off, 96 + 12*4
+#define STACK_SIZE 224
+ .global sm3_mb_asimd_x1
+ .type sm3_mb_asimd_x1, %function
+sm3_mb_asimd_x1:
+ stp x29,x30, [sp,-STACK_SIZE]!
+ cmp len,0
+ ldr data,[job],64
+ ldp qdig0,qdig1,[digest]
+ stp x19, x20, [sp, 16]
+ stp x21, x22, [sp, 32]
+ rev32 vdig0.16b,vdig0.16b
+ stp x23, x24, [sp, 48]
+ rev32 vdig1.16b,vdig1.16b
+ stp x25, x26, [sp, 64]
+ stp x27, x28, [sp, 80]
+ ble .exit_func
+
+.start_loop:
+
+ /** prepare first 12 round data **/
+ ld1 {vvect_msg0.16b-vvect_msg3.16b},[data],64
+ mov Tj, 17689
+ umov dig_A,vdig0.s[0]
+ movk Tj, 0x79cc, lsl 16
+ rev32 vvect_msg0.16b,vvect_msg0.16b
+ umov dig_B,vdig0.s[1]
+ rev32 vvect_msg1.16b,vvect_msg1.16b
+ umov dig_C,vdig0.s[2]
+ rev32 vvect_msg2.16b,vvect_msg2.16b
+ umov dig_D,vdig0.s[3]
+ rev32 vvect_msg3.16b,vvect_msg3.16b
+ umov dig_E,vdig1.s[0]
+ stp qvect_msg0,qvect_msg1,[sp,msg_off]
+ umov dig_F,vdig1.s[1]
+ stp qvect_msg2,qvect_msg3,[sp,msg_off+32]
+ umov dig_G,vdig1.s[2]
+ eor vvect_msgP0.16b,vvect_msg0.16b,vvect_msg1.16b
+ eor vvect_msgP1.16b,vvect_msg1.16b,vvect_msg2.16b
+ umov dig_H,vdig1.s[3]
+ stp qvect_msgP0,qvect_msgP1,[sp,wp_off]
+ eor vvect_msgP2.16b,vvect_msg2.16b,vvect_msg3.16b
+ str qvect_msgP2,[sp,wp_off+32]
+
+ sm3_round_0 0
+ sm3_round_0 1
+ sm3_round_0 2
+ sm3_round_0 3
+ sm3_round_0 4
+ sm3_round_0 5
+ sm3_round_0 6
+ sm3_round_0 7
+ sm3_round_0 8
+ sm3_round_0 9
+ sm3_round_0 10
+ sm3_round_0 11
+
+ sm3_round_12 12
+ sm3_round_12 13
+ sm3_round_12 14
+ sm3_round_12 15
+ mov Tj, 0x7a87
+ movk Tj, 0x9d8a, lsl 16
+ sm3_round_16 16
+ sm3_round_16 17
+ sm3_round_16 18
+ sm3_round_16 19
+ sm3_round_16 20
+ sm3_round_16 21
+ sm3_round_16 22
+ sm3_round_16 23
+ sm3_round_16 24
+ sm3_round_16 25
+ sm3_round_16 26
+ sm3_round_16 27
+ sm3_round_16 28
+ sm3_round_16 29
+ sm3_round_16 30
+ sm3_round_16 31
+ sm3_round_16 32
+ sm3_round_16 33
+ sm3_round_16 34
+ sm3_round_16 35
+ sm3_round_16 36
+ sm3_round_16 37
+ sm3_round_16 38
+ sm3_round_16 39
+ sm3_round_16 40
+ sm3_round_16 41
+ sm3_round_16 42
+ sm3_round_16 43
+ sm3_round_16 44
+ sm3_round_16 45
+ sm3_round_16 46
+ sm3_round_16 47
+ sm3_round_16 48
+ sm3_round_16 49
+ sm3_round_16 50
+ sm3_round_16 51
+ sm3_round_16 52
+ sm3_round_16 53
+ sm3_round_16 54
+ sm3_round_16 55
+ sm3_round_16 56
+ sm3_round_16 57
+ sm3_round_16 58
+ sm3_round_16 59
+ sm3_round_16 60
+ sm3_round_16 61
+ sm3_round_16 62
+ sm3_round_63 63
+ subs len,len,1
+ eor vdig0.16b,vdig0.16b,vdig0_bak.16b
+ eor vdig1.16b,vdig1.16b,vdig1_bak.16b
+ bne .start_loop
+.exit_func:
+ ldp x19, x20, [sp, 16]
+ rev32 vdig0.16b,vdig0.16b
+ ldp x21, x22, [sp, 32]
+ rev32 vdig1.16b,vdig1.16b
+ ldp x23, x24, [sp, 48]
+ stp qdig0,qdig1,[digest]
+ ldp x25, x26, [sp, 64]
+ ldp x27, x28, [sp, 80]
+ ldp x29, x30, [sp], STACK_SIZE
+ ret
+ .size sm3_mb_asimd_x1, .-sm3_mb_asimd_x1
+
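For readers following the round macros above, the SS1/SS2/TT1/TT2 data flow they implement is the
standard SM3 compression round. The scalar C rendering of one round below (j >= 16 form, matching
sm3_round_16) is a reading aid only, not part of the patch; it mirrors the in-line comments such
as "//C = ROTL32(B, 9)" and "// E = P0(TT2)", and the function name is made up for illustration.

    #include <stdint.h>

    #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

    /* Illustrative sketch only: one SM3 round for j >= 16.  dig[] holds A..H,
     * w is W[j], wp is the "word pair" W'[j] = W[j] ^ W[j+4], and tj is the
     * round constant already rotated for round j. */
    static void sm3_round_c(uint32_t dig[8], uint32_t w, uint32_t wp, uint32_t tj)
    {
            uint32_t A = dig[0], B = dig[1], C = dig[2], D = dig[3];
            uint32_t E = dig[4], F = dig[5], G = dig[6], H = dig[7];
            uint32_t SS1, SS2, TT1, TT2;

            SS1 = ROTL32(ROTL32(A, 12) + E + tj, 7);
            SS2 = SS1 ^ ROTL32(A, 12);
            TT1 = ((A & B) | (A & C) | (B & C)) + D + SS2 + wp;  /* FF for j >= 16 */
            TT2 = ((E & F) | (~E & G)) + H + SS1 + w;            /* GG for j >= 16 */

            dig[3] = C;                                          /* D = C            */
            dig[2] = ROTL32(B, 9);                               /* C = ROTL32(B, 9) */
            dig[1] = A;                                          /* B = A            */
            dig[0] = TT1;                                        /* A = TT1          */
            dig[7] = G;                                          /* H = G            */
            dig[6] = ROTL32(F, 19);                              /* G = ROTL32(F,19) */
            dig[5] = E;                                          /* F = E            */
            dig[4] = TT2 ^ ROTL32(TT2, 9) ^ ROTL32(TT2, 17);     /* E = P0(TT2)      */
    }
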
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S
new file mode 100644
index 000000000..975a07c7a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S
@@ -0,0 +1,576 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a
+ .text
+ .align 2
+ .p2align 3,,7
+
+.macro declare_var_vector_reg name:req,reg:req
+ q\name\() .req q\reg
+ v\name\() .req v\reg
+ s\name\() .req s\reg
+.endm
+
+ job0 .req x0
+ job1 .req x1
+ job2 .req x2
+ job3 .req x3
+ len .req x4
+
+ job0_data .req x5
+ job1_data .req x6
+ job2_data .req x7
+ job3_data .req x9
+
+ job0_digest .req x0
+ job1_digest .req x1
+ job2_digest .req x2
+ job3_digest .req x3
+ job0_tmp .req x10
+ job1_tmp .req x11
+ job2_tmp .req x12
+ job3_tmp .req x13
+ const_adr .req x14
+
+
+ declare_var_vector_reg msg0,0
+ declare_var_vector_reg msg1,1
+ declare_var_vector_reg msg2,2
+ declare_var_vector_reg msg3,3
+ declare_var_vector_reg msg4,4
+ declare_var_vector_reg msg5,5
+ declare_var_vector_reg msg6,6
+ declare_var_vector_reg msg7,7
+ declare_var_vector_reg msg8,8
+ declare_var_vector_reg msg9,9
+ declare_var_vector_reg msg10,10
+ declare_var_vector_reg msg11,11
+ declare_var_vector_reg msg12,12
+ declare_var_vector_reg msg13,13
+ declare_var_vector_reg msg14,14
+ declare_var_vector_reg msg15,15
+ declare_var_vector_reg msg16,16
+
+
+ declare_var_vector_reg dig_A,24
+ declare_var_vector_reg dig_B,25
+ declare_var_vector_reg dig_C,26
+ declare_var_vector_reg dig_D,27
+ declare_var_vector_reg dig_E,28
+ declare_var_vector_reg dig_F,29
+ declare_var_vector_reg dig_G,30
+ declare_var_vector_reg dig_H,31
+
+ declare_var_vector_reg TT1,17
+ declare_var_vector_reg TT2,18
+ declare_var_vector_reg SS1,19
+ declare_var_vector_reg SS2,20
+ declare_var_vector_reg tmp0,21
+ declare_var_vector_reg word_pair,23
+ declare_var_vector_reg Tj,22
+
+
+.macro rol32 target:req,reg:req,bit:req
+ ushr v\target\().4s,v\reg\().4s,32 - \bit
+ sli v\target\().4s,v\reg\().4s,\bit
+.endm
+
+// round 0-11
+.macro sm3_round_0 round:req,wp:req
+
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
+
+ add vSS1.4s,vdig_E.4s,vTj.4s
+ sli vtmp0.4s,vdig_A.4s,12
+ rev32 vmsg\round\().16b,vmsg\round\().16b
+ rev32 vmsg\wp\().16b,vmsg\wp\().16b
+ add vTT1.4s,vSS1.4s,vtmp0.4s //SS1 Done
+ rol32 SS1,TT1,7
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\wp\().16b
+
+ eor vTT1.16b,vdig_A.16b,vdig_B.16b
+ eor vTT2.16b,vdig_E.16b,vdig_F.16b
+ eor vTT1.16b,vTT1.16b,vdig_C.16b
+ eor vTT2.16b,vTT2.16b,vdig_G.16b
+
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
+ add vSS2.4s,vSS2.4s,vword_pair.4s
+ add vTT1.4s,vTT1.4s,vdig_D.4s
+ add vTT2.4s,vTT2.4s,vdig_H.4s
+ ushr vtmp0.4s,vTj.4s,32-1
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
+ sli vtmp0.4s,vTj.4s,1
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
+ mov vTj.16b,vtmp0.16b
+ //D=C
+ mov vdig_D.16b,vdig_C.16b
+ //C = ROTL32(B, 9);
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
+ sli vdig_C.4s,vdig_B.4s,9
+ //B=A
+ mov vdig_B.16b,vdig_A.16b
+ //A=TT1
+ mov vdig_A.16b,vTT1.16b
+ // H=G
+ mov vdig_H.16b,vdig_G.16b
+ //G = ROTL32(F,19)
+ rol32 dig_G,dig_F,19
+ //F = E
+ mov vdig_F.16b,vdig_E.16b
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
+ // E = P0(TT2);
+ ushr vSS2.4s, vTT2.4s, 32 - 9
+ ushr vSS1.4s, vTT2.4s, 32 - 17
+ sli vSS2.4s, vTT2.4s, 9
+ sli vSS1.4s, vTT2.4s, 17
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
+
+.endm
+
+
+.macro sm3_round_4 round:req,wp:req
+
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
+ add vSS1.4s,vdig_E.4s,vTj.4s
+ sli vtmp0.4s,vdig_A.4s,12
+ rev32 vmsg\wp\().16b,vmsg\wp\().16b
+ add vTT1.4s,vSS1.4s,vtmp0.4s //SS1 Done
+ rol32 SS1,TT1,7
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\wp\().16b
+ eor vTT1.16b,vdig_A.16b,vdig_B.16b
+ eor vTT2.16b,vdig_E.16b,vdig_F.16b
+ eor vTT1.16b,vTT1.16b,vdig_C.16b
+ eor vTT2.16b,vTT2.16b,vdig_G.16b
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
+ add vSS2.4s,vSS2.4s,vword_pair.4s
+ add vTT1.4s,vTT1.4s,vdig_D.4s
+ add vTT2.4s,vTT2.4s,vdig_H.4s
+ ushr vtmp0.4s,vTj.4s,32-1
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
+ sli vtmp0.4s,vTj.4s,1
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
+ mov vTj.16b,vtmp0.16b
+ //D=C
+ mov vdig_D.16b,vdig_C.16b
+ //C = ROTL32(B, 9);
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
+ sli vdig_C.4s,vdig_B.4s,9
+ //B=A
+ mov vdig_B.16b,vdig_A.16b
+ //A=TT1
+ mov vdig_A.16b,vTT1.16b
+ // H=G
+ mov vdig_H.16b,vdig_G.16b
+ //G = ROTL32(F,19)
+ rol32 dig_G,dig_F,19
+ //F = E
+ mov vdig_F.16b,vdig_E.16b
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
+ // E = P0(TT2);
+ ushr vSS2.4s, vTT2.4s, 32 - 9
+ ushr vSS1.4s, vTT2.4s, 32 - 17
+ sli vSS2.4s, vTT2.4s, 9
+ sli vSS1.4s, vTT2.4s, 17
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
+
+.endm
+
+//round 12-15
+.macro sm3_round_12 round:req,plus_4:req,m0,m1,m2,m3,m4
+ rol32 msg\plus_4,msg\m2,15
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b
+ rol32 tmp0,msg\plus_4,15
+ rol32 word_pair,msg\plus_4,23
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b
+ rol32 tmp0,msg\m3,7
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
+ sli vtmp0.4s,vdig_A.4s,12
+ add vSS1.4s,vdig_E.4s,vTj.4s
+ add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done
+ rol32 SS1,SS2,7
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b
+ eor vTT1.16b,vdig_A.16b,vdig_B.16b
+ eor vTT1.16b,vTT1.16b,vdig_C.16b
+ eor vTT2.16b,vdig_E.16b,vdig_F.16b
+ eor vTT2.16b,vTT2.16b,vdig_G.16b
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
+ add vSS2.4s,vSS2.4s,vword_pair.4s
+ add vTT1.4s,vTT1.4s,vdig_D.4s
+ add vTT2.4s,vTT2.4s,vdig_H.4s
+ ushr vtmp0.4s,vTj.4s,32-1
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
+ sli vtmp0.4s,vTj.4s,1
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
+ mov vTj.16b,vtmp0.16b
+ //D=C
+ mov vdig_D.16b,vdig_C.16b
+ //C = ROTL32(B, 9);
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
+ sli vdig_C.4s,vdig_B.4s,9
+ //B=A
+ mov vdig_B.16b,vdig_A.16b
+ //A=TT1
+ mov vdig_A.16b,vTT1.16b
+ // H=G
+ mov vdig_H.16b,vdig_G.16b
+ //G = ROTL32(F,19)
+ rol32 dig_G,dig_F,19
+ //F = E
+ mov vdig_F.16b,vdig_E.16b
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
+ // E = P0(TT2);
+ ushr vSS2.4s, vTT2.4s, 32 - 9
+ ushr vSS1.4s, vTT2.4s, 32 - 17
+ sli vSS2.4s, vTT2.4s, 9
+ sli vSS1.4s, vTT2.4s, 17
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
+.endm
+
+// round 16-62
+.macro sm3_round_16 round:req,plus_4:req,m0,m1,m2,m3,m4
+ rol32 msg\plus_4,msg\m2,15
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b
+ rol32 tmp0,msg\plus_4,15
+ rol32 word_pair,msg\plus_4,23
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b
+ rol32 tmp0,msg\m3,7
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
+ sli vtmp0.4s,vdig_A.4s,12
+ add vSS1.4s,vdig_E.4s,vTj.4s
+ add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done
+ rol32 SS1,SS2,7
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b
+ mov vTT2.16b,vdig_E.16b
+ orr vTT1.16b,vdig_B.16b,vdig_C.16b
+ and vtmp0.16b,vdig_B.16b,vdig_C.16b
+ bsl vTT2.16b,vdig_F.16b,vdig_G.16b
+ and vTT1.16b,vTT1.16b,vdig_A.16b
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
+ orr vTT1.16b,vTT1.16b,vtmp0.16b
+ add vSS2.4s,vSS2.4s,vword_pair.4s
+ add vTT1.4s,vTT1.4s,vdig_D.4s
+ add vTT2.4s,vTT2.4s,vdig_H.4s
+ ushr vtmp0.4s,vTj.4s,32-1
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
+ sli vtmp0.4s,vTj.4s,1
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
+ mov vTj.16b,vtmp0.16b
+ //D=C
+ mov vdig_D.16b,vdig_C.16b
+ //C = ROTL32(B, 9);
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
+ sli vdig_C.4s,vdig_B.4s,9
+ //B=A
+ mov vdig_B.16b,vdig_A.16b
+ //A=TT1
+ mov vdig_A.16b,vTT1.16b
+ // H=G
+ mov vdig_H.16b,vdig_G.16b
+ //G = ROTL32(F,19)
+ rol32 dig_G,dig_F,19
+ //F = E
+ mov vdig_F.16b,vdig_E.16b
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
+ // E = P0(TT2);
+ ushr vSS2.4s, vTT2.4s, 32 - 9
+ ushr vSS1.4s, vTT2.4s, 32 - 17
+ sli vSS2.4s, vTT2.4s, 9
+ sli vSS1.4s, vTT2.4s, 17
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
+.endm
+
+//round 63
+.macro sm3_round_63 round:req,plus_4:req,m0,m1,m2,m3,m4
+ rol32 msg\plus_4,msg\m2,15
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b
+ rol32 tmp0,msg\plus_4,15
+ rol32 word_pair,msg\plus_4,23
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b
+ rol32 tmp0,msg\m3,7
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
+ sli vtmp0.4s,vdig_A.4s,12
+ add vSS1.4s,vdig_E.4s,vTj.4s
+ add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done
+ rol32 SS1,SS2,7
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b
+
+ ldp qmsg0,qmsg1,[sp,dig_off+ 0]
+ mov vTT2.16b,vdig_E.16b
+ ldp qmsg2,qmsg3,[sp,dig_off+ 32]
+ orr vTT1.16b,vdig_B.16b,vdig_C.16b
+ ldp qmsg4,qmsg5,[sp,dig_off+ 64]
+ and vtmp0.16b,vdig_B.16b,vdig_C.16b
+ bsl vTT2.16b,vdig_F.16b,vdig_G.16b
+ ldp qmsg6,qmsg7,[sp,dig_off+ 96]
+ and vTT1.16b,vTT1.16b,vdig_A.16b
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
+ orr vTT1.16b,vTT1.16b,vtmp0.16b
+ add vSS2.4s,vSS2.4s,vword_pair.4s
+ add vTT1.4s,vTT1.4s,vdig_D.4s
+ add vTT2.4s,vTT2.4s,vdig_H.4s
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
+ //D=C
+ eor vdig_D.16b,vdig_C.16b,vmsg3.16b
+ //C = ROTL32(B, 9);
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
+ sli vdig_C.4s,vdig_B.4s,9
+ eor vdig_C.16b,vdig_C.16b,vmsg2.16b
+ //B=A
+ eor vdig_B.16b,vdig_A.16b,vmsg1.16b
+ stp qdig_C,qdig_D,[sp,dig_off+ 32]
+ //A=TT1
+ eor vdig_A.16b,vTT1.16b,vmsg0.16b
+ // H=G
+ eor vdig_H.16b,vdig_G.16b,vmsg7.16b
+ stp qdig_A,qdig_B,[sp,dig_off+ 0]
+ //G = ROTL32(F,19)
+ rol32 dig_G,dig_F,19
+ eor vdig_G.16b,vdig_G.16b,vmsg6.16b
+ //F = E
+ eor vdig_F.16b,vdig_E.16b,vmsg5.16b
+ stp qdig_G,qdig_H,[sp,dig_off+ 96]
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
+ // E = P0(TT2);
+ ushr vSS2.4s, vTT2.4s, 32 - 9
+ ushr vSS1.4s, vTT2.4s, 32 - 17
+ sli vSS2.4s, vTT2.4s, 9
+ sli vSS1.4s, vTT2.4s, 17
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
+ eor vdig_E.16b, vdig_E.16b, vmsg4.16b
+ stp qdig_E,qdig_F,[sp,dig_off+ 64]
+.endm
+
+ .set dig_off , 80
+
+#define STACK_SIZE 224
+ .global sm3_mb_asimd_x4
+ .type sm3_mb_asimd_x4, %function
+sm3_mb_asimd_x4:
+ stp x29,x30, [sp,-STACK_SIZE]!
+ cmp len,0
+ //push d8~d15
+ ldr job0_data, [job0],64
+ stp d8,d9, [sp,16]
+ ldr job1_data, [job1],64
+ stp d10,d11,[sp,32]
+ ldr job2_data, [job2],64
+ stp d12,d13,[sp,48]
+ ldr job3_data, [job3],64
+ stp d14,d15,[sp,64]
+ ble .exit_func
+
+ mov job0_tmp,job0_digest
+ mov job1_tmp,job1_digest
+ mov job2_tmp,job2_digest
+ mov job3_tmp,job3_digest
+ //load digests
+ ld4 {vdig_A.s-vdig_D.s}[0],[job0_tmp],16
+ ld4 {vdig_A.s-vdig_D.s}[1],[job1_tmp],16
+ ld4 {vdig_A.s-vdig_D.s}[2],[job2_tmp],16
+ adrp const_adr, .consts
+ ld4 {vdig_A.s-vdig_D.s}[3],[job3_tmp],16
+ add const_adr, const_adr, #:lo12:.consts
+ ld4 {vdig_E.s-vdig_H.s}[0],[job0_tmp]
+ rev32 vdig_A.16b,vdig_A.16b
+ ld4 {vdig_E.s-vdig_H.s}[1],[job1_tmp]
+ rev32 vdig_B.16b,vdig_B.16b
+ ld4 {vdig_E.s-vdig_H.s}[2],[job2_tmp]
+ rev32 vdig_C.16b,vdig_C.16b
+ ld4 {vdig_E.s-vdig_H.s}[3],[job3_tmp]
+ rev32 vdig_D.16b,vdig_D.16b
+ stp qdig_A,qdig_B,[sp,dig_off+ 0]
+ rev32 vdig_E.16b,vdig_E.16b
+ rev32 vdig_F.16b,vdig_F.16b
+ stp qdig_C,qdig_D,[sp,dig_off+ 32]
+ rev32 vdig_G.16b,vdig_G.16b
+ rev32 vdig_H.16b,vdig_H.16b
+ stp qdig_E,qdig_F,[sp,dig_off+ 64]
+ stp qdig_G,qdig_H,[sp,dig_off+ 96]
+
+.start_loop:
+ ld4 {vmsg0.s-vmsg3.s}[0],[job0_data],16
+ ld4 {vmsg0.s-vmsg3.s}[1],[job1_data],16
+ ld4 {vmsg0.s-vmsg3.s}[2],[job2_data],16
+ ld4 {vmsg0.s-vmsg3.s}[3],[job3_data],16
+ ld4 {vmsg4.s-vmsg7.s}[0],[job0_data],16
+ ld4 {vmsg4.s-vmsg7.s}[1],[job1_data],16
+ ld4 {vmsg4.s-vmsg7.s}[2],[job2_data],16
+ ld4 {vmsg4.s-vmsg7.s}[3],[job3_data],16
+	ld4	{vmsg8.s-vmsg11.s}[0],[job0_data],16
+ ldr qTj,[const_adr]
+
+ sm3_round_0 0, 4
+
+ ld4 {vmsg8.s-vmsg11.s}[1],[job1_data],16
+ sm3_round_0 1, 5
+
+ ld4 {vmsg8.s-vmsg11.s}[2],[job2_data],16
+ sm3_round_0 2, 6
+ ld4 {vmsg8.s-vmsg11.s}[3],[job3_data],16
+ sm3_round_0 3, 7
+
+ ld4 {vmsg12.s-vmsg15.s}[0],[job0_data],16
+
+ sm3_round_4 4, 8
+ ld4 {vmsg12.s-vmsg15.s}[1],[job1_data],16
+ sm3_round_4 5, 9
+ ld4 {vmsg12.s-vmsg15.s}[2],[job2_data],16
+ sm3_round_4 6,10
+ ld4 {vmsg12.s-vmsg15.s}[3],[job3_data],16
+ sm3_round_4 7,11
+ sm3_round_4 8,12
+ sm3_round_4 9,13
+ sm3_round_4 10,14
+ sm3_round_4 11,15
+
+ sm3_round_12 12,16, 0, 7,13, 3,10 //12
+ sm3_round_12 13, 0, 1, 8,14, 4,11 //13
+ sm3_round_12 14, 1, 2, 9,15, 5,12 //14
+ sm3_round_12 15, 2, 3,10,16, 6,13 //15
+
+ ldr qTj,[const_adr,16]
+ sm3_round_16 16, 3, 4,11, 0, 7,14 //16
+#if 0
+ stp sdig_A,sdig_B,[job0_digest]
+ stp sdig_C,sdig_D,[job0_digest,8]
+ stp sdig_E,sdig_F,[job0_digest,16]
+ stp sdig_G,sdig_H,[job0_digest,24]
+ b .exit_func
+#endif
+ sm3_round_16 0, 4, 5,12, 1, 8,15 //17
+
+ sm3_round_16 1, 5, 6,13, 2, 9,16 //18
+ sm3_round_16 2, 6, 7,14, 3,10, 0 //19
+ sm3_round_16 3, 7, 8,15, 4,11, 1 //20
+ sm3_round_16 4, 8, 9,16, 5,12, 2 //21
+ sm3_round_16 5, 9,10, 0, 6,13, 3 //22
+ sm3_round_16 6,10,11, 1, 7,14, 4 //23
+ sm3_round_16 7,11,12, 2, 8,15, 5 //24
+ sm3_round_16 8,12,13, 3, 9,16, 6 //25
+ sm3_round_16 9,13,14, 4,10, 0, 7 //26
+ sm3_round_16 10,14,15, 5,11, 1, 8 //27
+ sm3_round_16 11,15,16, 6,12, 2, 9 //28
+ sm3_round_16 12,16, 0, 7,13, 3,10 //29
+ sm3_round_16 13, 0, 1, 8,14, 4,11 //30
+ sm3_round_16 14, 1, 2, 9,15, 5,12 //31
+ sm3_round_16 15, 2, 3,10,16, 6,13 //32
+ sm3_round_16 16, 3, 4,11, 0, 7,14 //33
+ sm3_round_16 0, 4, 5,12, 1, 8,15 //34
+ sm3_round_16 1, 5, 6,13, 2, 9,16 //35
+ sm3_round_16 2, 6, 7,14, 3,10, 0 //36
+ sm3_round_16 3, 7, 8,15, 4,11, 1 //37
+ sm3_round_16 4, 8, 9,16, 5,12, 2 //38
+ sm3_round_16 5, 9,10, 0, 6,13, 3 //39
+ sm3_round_16 6,10,11, 1, 7,14, 4 //40
+ sm3_round_16 7,11,12, 2, 8,15, 5 //41
+ sm3_round_16 8,12,13, 3, 9,16, 6 //42
+ sm3_round_16 9,13,14, 4,10, 0, 7 //43
+ sm3_round_16 10,14,15, 5,11, 1, 8 //44
+ sm3_round_16 11,15,16, 6,12, 2, 9 //45
+ sm3_round_16 12,16, 0, 7,13, 3,10 //46
+ sm3_round_16 13, 0, 1, 8,14, 4,11 //47
+ sm3_round_16 14, 1, 2, 9,15, 5,12 //48
+ sm3_round_16 15, 2, 3,10,16, 6,13 //49
+ sm3_round_16 16, 3, 4,11, 0, 7,14 //50
+ sm3_round_16 0, 4, 5,12, 1, 8,15 //51
+ sm3_round_16 1, 5, 6,13, 2, 9,16 //52
+ sm3_round_16 2, 6, 7,14, 3,10, 0 //53
+ sm3_round_16 3, 7, 8,15, 4,11, 1 //54
+ sm3_round_16 4, 8, 9,16, 5,12, 2 //55
+ sm3_round_16 5, 9,10, 0, 6,13, 3 //56
+ sm3_round_16 6,10,11, 1, 7,14, 4 //57
+ sm3_round_16 7,11,12, 2, 8,15, 5 //58
+ sm3_round_16 8,12,13, 3, 9,16, 6 //59
+ sm3_round_16 9,13,14, 4,10, 0, 7 //60
+ sm3_round_16 10,14,15, 5,11, 1, 8 //61
+ sm3_round_16 11,15,16, 6,12, 2, 9 //62
+ sm3_round_63 12,16, 0, 7,13, 3,10 //63
+
+ subs len,len,1
+ bne .start_loop
+
+	//store digests back in big-endian byte order
+ rev32 vdig_A.16b,vdig_A.16b
+ rev32 vdig_B.16b,vdig_B.16b
+ rev32 vdig_C.16b,vdig_C.16b
+ rev32 vdig_D.16b,vdig_D.16b
+ st4 {vdig_A.s-vdig_D.s}[0],[job0_digest],16
+ rev32 vdig_E.16b,vdig_E.16b
+ rev32 vdig_F.16b,vdig_F.16b
+ st4 {vdig_A.s-vdig_D.s}[1],[job1_digest],16
+ rev32 vdig_G.16b,vdig_G.16b
+ rev32 vdig_H.16b,vdig_H.16b
+ st4 {vdig_A.s-vdig_D.s}[2],[job2_digest],16
+ st4 {vdig_A.s-vdig_D.s}[3],[job3_digest],16
+ st4 {vdig_E.s-vdig_H.s}[0],[job0_digest]
+ st4 {vdig_E.s-vdig_H.s}[1],[job1_digest]
+ st4 {vdig_E.s-vdig_H.s}[2],[job2_digest]
+ st4 {vdig_E.s-vdig_H.s}[3],[job3_digest]
+
+.exit_func:
+ ldp d8, d9, [sp,16]
+ ldp d10,d11,[sp,32]
+ ldp d12,d13,[sp,48]
+ ldp d14,d15,[sp,64]
+ ldp x29, x30, [sp], STACK_SIZE
+ ret
+.consts:
+ .word 0x79cc4519
+ .word 0x79cc4519
+ .word 0x79cc4519
+ .word 0x79cc4519
+ .word 0x9d8a7a87
+ .word 0x9d8a7a87
+ .word 0x9d8a7a87
+ .word 0x9d8a7a87
+ .size sm3_mb_asimd_x4, .-sm3_mb_asimd_x4
+
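The x4 kernel above evaluates four SM3 message streams in parallel, one per vector lane, keeping SS1/SS2/TT1/TT2 in vector temporaries and spelling out the P0 permutation as ushr/sli pairs. As a cross-check for a single lane, a scalar C sketch of one round for j >= 16 (the FF/GG variants matched by the orr/and/bsl sequence in sm3_round_16) could look as follows; the function and parameter names are illustrative only, and Tj stands for ROTL32(T_j, j), which the assembly keeps pre-rotated in vTj.

    #include <stdint.h>

    static inline uint32_t rotl32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }
    static inline uint32_t P0(uint32_t x) { return x ^ rotl32(x, 9) ^ rotl32(x, 17); }

    /* One SM3 round for j >= 16 over the state A..H of a single lane.  Wj/Wj4
     * are the schedule words W[j] and W[j+4]; Tj is ROTL32(T_j, j). */
    static void sm3_round_ref(uint32_t s[8], uint32_t Wj, uint32_t Wj4, uint32_t Tj)
    {
        uint32_t A = s[0], B = s[1], C = s[2], D = s[3];
        uint32_t E = s[4], F = s[5], G = s[6], H = s[7];
        uint32_t SS1 = rotl32(rotl32(A, 12) + E + Tj, 7);
        uint32_t SS2 = SS1 ^ rotl32(A, 12);
        uint32_t TT1 = ((A & B) | (A & C) | (B & C)) + D + SS2 + (Wj ^ Wj4);
        uint32_t TT2 = ((E & F) | (~E & G)) + H + SS1 + Wj;

        s[3] = C;             /* D = C            */
        s[2] = rotl32(B, 9);  /* C = ROTL32(B, 9) */
        s[1] = A;             /* B = A            */
        s[0] = TT1;           /* A = TT1          */
        s[7] = G;             /* H = G            */
        s[6] = rotl32(F, 19); /* G = ROTL32(F,19) */
        s[5] = E;             /* F = E            */
        s[4] = P0(TT2);       /* E = P0(TT2)      */
    }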
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c
new file mode 100644
index 000000000..6e1dff45e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c
@@ -0,0 +1,246 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "sm3_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+#define SM3_LOG2_BLOCK_SIZE 6
+void sm3_mb_mgr_init_asimd(SM3_MB_JOB_MGR * state);
+SM3_JOB *sm3_mb_mgr_submit_asimd(SM3_MB_JOB_MGR * state, SM3_JOB * job);
+SM3_JOB *sm3_mb_mgr_flush_asimd(SM3_MB_JOB_MGR * state);
+static inline void hash_init_digest(SM3_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len);
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx);
+
+void sm3_ctx_mgr_init_asimd(SM3_HASH_CTX_MGR * mgr)
+{
+ sm3_mb_mgr_init_asimd(&mgr->mgr);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_submit_asimd(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SM3_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SM3_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_fixedlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SM3_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SM3_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sm3_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_flush_asimd(SM3_HASH_CTX_MGR * mgr)
+{
+ SM3_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_flush_asimd(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sm3_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sm3_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SM3_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SM3_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SM3_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SM3_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_asimd(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define cpu_to_be32(v) (((v&0xff000000)>>24) | ((v&0xff0000)>>8) | ((v&0xff00)<<8) | ((v&0xff)<<24))
+#else
+#define cpu_to_be32(v) (v)
+#endif
+static inline void hash_init_digest(SM3_WORD_T * digest)
+{
+ static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] =
+ { cpu_to_be32(0x7380166f), cpu_to_be32(0x4914b2b9),
+ cpu_to_be32(0x172442d7), cpu_to_be32(0xda8a0600),
+ cpu_to_be32(0xa96f30bc), cpu_to_be32(0x163138aa),
+ cpu_to_be32(0xe38dee4d), cpu_to_be32(0xb0fb0e4e)
+ };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SM3_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SM3_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SM3_BLOCK_SIZE - 1) & (0 - (total_len + SM3_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SM3_PADLENGTHFIELD_SIZE;
+
+#if SM3_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SM3_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
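The context manager above layers partial-block buffering and padding on top of the ASIMD job manager. A minimal caller sketch, assuming the usual isa-l_crypto multi-buffer conventions (the hash_ctx_init() helper from multi_buffer.h and HASH_ENTIRE for buffers that are complete messages); the buffer sizes and names below are illustrative, not part of the patch.

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>
    #include "sm3_mb.h"

    int main(void)
    {
        SM3_HASH_CTX_MGR *mgr = NULL;
        SM3_HASH_CTX ctxpool[3], *ctx;
        static uint8_t bufs[3][1024];   /* illustrative inputs, all zero here */
        int i;

        if (posix_memalign((void **)&mgr, 16, sizeof(*mgr)))
            return 1;
        sm3_ctx_mgr_init_asimd(mgr);

        for (i = 0; i < 3; i++) {
            hash_ctx_init(&ctxpool[i]);   /* helper assumed from multi_buffer.h */
            /* Each buffer is a whole message, so FIRST and LAST are combined. */
            sm3_ctx_mgr_submit_asimd(mgr, &ctxpool[i], bufs[i],
                                     sizeof(bufs[i]), HASH_ENTIRE);
        }
        /* With fewer than four jobs the submits return NULL; flush drains the lanes. */
        while ((ctx = sm3_ctx_mgr_flush_asimd(mgr)) != NULL)
            printf("job %d done, digest[0] = 0x%08x\n",
                   (int)(ctx - ctxpool), ctx->job.result_digest[0]);

        free(mgr);
        return 0;
    }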
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c
new file mode 100644
index 000000000..5af9ead38
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c
@@ -0,0 +1,241 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "sm3_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+#define SM3_LOG2_BLOCK_SIZE 6
+void sm3_mb_mgr_init_sm(SM3_MB_JOB_MGR * state);
+SM3_JOB *sm3_mb_mgr_submit_sm(SM3_MB_JOB_MGR * state, SM3_JOB * job);
+SM3_JOB *sm3_mb_mgr_flush_sm(SM3_MB_JOB_MGR * state);
+static inline void hash_init_digest(SM3_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len);
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx);
+
+void sm3_ctx_mgr_init_sm(SM3_HASH_CTX_MGR * mgr)
+{
+ sm3_mb_mgr_init_sm(&mgr->mgr);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_submit_sm(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SM3_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SM3_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_fixedlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SM3_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SM3_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_sm(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sm3_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_flush_sm(SM3_HASH_CTX_MGR * mgr)
+{
+ SM3_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_flush_sm(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sm3_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sm3_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SM3_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SM3_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SM3_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SM3_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_sm(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_sm(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SM3_WORD_T * digest)
+{
+ static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] =
+ { to_be32(0x7380166f), to_be32(0x4914b2b9),
+ to_be32(0x172442d7), to_be32(0xda8a0600),
+ to_be32(0xa96f30bc), to_be32(0x163138aa),
+ to_be32(0xe38dee4d), to_be32(0xb0fb0e4e)
+ };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SM3_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SM3_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SM3_BLOCK_SIZE - 1) & (0 - (total_len + SM3_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SM3_PADLENGTHFIELD_SIZE;
+
+#if SM3_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SM3_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
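hash_pad() above places the 0x80 marker, the zero fill and the 64-bit bit-length after the buffered tail and reports how many trailing blocks that produces. A small standalone check of the index arithmetic, assuming SM3_PADLENGTHFIELD_SIZE is 8 (SM3 uses a 64-bit length field); the helper name is illustrative.

    /* Padding needs one 0x80 byte plus an 8-byte bit length, so a message whose
     * last block already holds more than 64 - 9 = 55 bytes spills into a second
     * trailing block.  The expression mirrors the one in hash_pad(). */
    #include <assert.h>
    #include <stdint.h>

    static uint32_t extra_blocks(uint64_t total_len)
    {
        uint32_t i = (uint32_t)(total_len & 63);           /* bytes buffered in the last block */
        i += ((63 & (0 - (total_len + 8 + 1))) + 1 + 8);   /* jump to the end of the length field */
        return i >> 6;
    }

    int main(void)
    {
        assert(extra_blocks(100) == 1);  /* 36 buffered: 0x80 + 19 zeros + length fits  */
        assert(extra_blocks(120) == 2);  /* 56 buffered: 56 + 1 + 8 > 64, second block  */
        assert(extra_blocks(64)  == 1);  /* block-aligned input still pads a full block */
        return 0;
    }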
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c
new file mode 100644
index 000000000..48a0d4d0e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c
@@ -0,0 +1,188 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stddef.h>
+#include <sm3_mb.h>
+#include <assert.h>
+
+#ifndef max
+#define max(a,b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#define SM3_MB_CE_MAX_LANES 4
+void sm3_mb_asimd_x4(SM3_JOB *, SM3_JOB *, SM3_JOB *, SM3_JOB *, int);
+void sm3_mb_asimd_x1(SM3_JOB *, int);
+
+#define LANE_IS_NOT_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FREE(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL)
+#define LANE_IS_INVALID(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL)
+void sm3_mb_mgr_init_asimd(SM3_MB_JOB_MGR * state)
+{
+ unsigned int i;
+
+ state->unused_lanes = 0xf;
+ state->num_lanes_inuse = 0;
+ for (i = 0; i < SM3_MB_CE_MAX_LANES; i++) {
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= SM3_MB_CE_MAX_LANES - 1 - i;
+ state->lens[i] = i;
+ state->ldata[i].job_in_lane = 0;
+ }
+
+	//lanes beyond SM3_MB_CE_MAX_LANES are marked invalid
+ for (; i < SM3_MAX_LANES; i++) {
+ state->lens[i] = 0xf;
+ state->ldata[i].job_in_lane = 0;
+ }
+}
+
+static int sm3_mb_mgr_do_jobs(SM3_MB_JOB_MGR * state)
+{
+ int lane_idx, len, i;
+
+ if (state->num_lanes_inuse == 0) {
+ return -1;
+ }
+ if (state->num_lanes_inuse == 4) {
+ len = min(min(state->lens[0], state->lens[1]),
+ min(state->lens[2], state->lens[3]));
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+ sm3_mb_asimd_x4(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane,
+ state->ldata[2].job_in_lane,
+ state->ldata[3].job_in_lane, len >> 4);
+ //only return the min length job
+ for (i = 0; i < SM3_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 2;
+ }
+ }
+
+ return lane_idx;
+ } else {
+ for (i = 0; i < SM3_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ len = state->lens[i] & (~0xf);
+ sm3_mb_asimd_x1(state->ldata[i].job_in_lane, len >> 4);
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 2;
+ return i;
+ }
+ }
+ }
+ return -1;
+
+}
+
+static SM3_JOB *sm3_mb_mgr_free_lane(SM3_MB_JOB_MGR * state)
+{
+ int i;
+ SM3_JOB *ret = NULL;
+
+ for (i = 0; i < SM3_MB_CE_MAX_LANES; i++) {
+ if (LANE_IS_FINISHED(state, i)) {
+
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->num_lanes_inuse--;
+ ret = state->ldata[i].job_in_lane;
+ ret->status = STS_COMPLETED;
+ state->ldata[i].job_in_lane = NULL;
+ break;
+ }
+ }
+ return ret;
+}
+
+static void sm3_mb_mgr_insert_job(SM3_MB_JOB_MGR * state, SM3_JOB * job)
+{
+ int lane_idx;
+ //add job into lanes
+ lane_idx = state->unused_lanes & 0xf;
+ //fatal error
+ assert(lane_idx < SM3_MB_CE_MAX_LANES);
+ state->lens[lane_idx] = (job->len << 4) | lane_idx;
+ state->ldata[lane_idx].job_in_lane = job;
+ state->unused_lanes >>= 4;
+ state->num_lanes_inuse++;
+}
+
+SM3_JOB *sm3_mb_mgr_submit_asimd(SM3_MB_JOB_MGR * state, SM3_JOB * job)
+{
+#ifndef NDEBUG
+ int lane_idx;
+#endif
+ SM3_JOB *ret;
+
+ //add job into lanes
+ sm3_mb_mgr_insert_job(state, job);
+
+ ret = sm3_mb_mgr_free_lane(state);
+ if (ret != NULL) {
+ return ret;
+ }
+	//submit defers processing until every lane has a job
+ if (state->num_lanes_inuse < SM3_MB_CE_MAX_LANES)
+ return NULL;
+#ifndef NDEBUG
+ lane_idx = sm3_mb_mgr_do_jobs(state);
+ assert(lane_idx != -1);
+#else
+ sm3_mb_mgr_do_jobs(state);
+#endif
+
+ //~ i = lane_idx;
+ ret = sm3_mb_mgr_free_lane(state);
+ return ret;
+}
+
+SM3_JOB *sm3_mb_mgr_flush_asimd(SM3_MB_JOB_MGR * state)
+{
+ SM3_JOB *ret;
+ ret = sm3_mb_mgr_free_lane(state);
+ if (ret) {
+ return ret;
+ }
+
+ sm3_mb_mgr_do_jobs(state);
+ return sm3_mb_mgr_free_lane(state);
+
+}
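The job manager above packs each lane's remaining block count into the upper bits of lens[i] and keeps the lane index in the low 4 bits, so the minimum over the active lens values yields both the shortest job and its lane in one comparison chain. A small sketch of that encoding, with illustrative macro names:

    #include <assert.h>
    #include <stdint.h>

    #define LANE_OF(v)   ((v) & 0xf)   /* low nibble: lane index               */
    #define BLOCKS_OF(v) ((v) >> 4)    /* upper bits: remaining 64-byte blocks */
    #define PACK(blocks, lane) (((blocks) << 4) | (lane))

    int main(void)
    {
        uint32_t lens[4] = { PACK(7, 0), PACK(3, 1), PACK(9, 2), PACK(3, 3) };
        uint32_t m = lens[0];
        for (int i = 1; i < 4; i++)
            if (lens[i] < m)
                m = lens[i];
        /* Lane 1 wins the tie with lane 3 because its low nibble is smaller. */
        assert(BLOCKS_OF(m) == 3 && LANE_OF(m) == 1);
        /* All lanes are then run for BLOCKS_OF(m) blocks and the winning lane's
         * job is the one handed back, as in sm3_mb_mgr_do_jobs(). */
        return 0;
    }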
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c
new file mode 100644
index 000000000..a7178e0be
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c
@@ -0,0 +1,250 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stddef.h>
+#include <sm3_mb.h>
+#include <assert.h>
+
+#ifndef max
+#define max(a,b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#define SM3_MB_CE_MAX_LANES 4
+#if SM3_MB_CE_MAX_LANES >=4
+void sm3_mb_sm_x4(SM3_JOB *, SM3_JOB *, SM3_JOB *, SM3_JOB *, int);
+#endif
+#if SM3_MB_CE_MAX_LANES >=3
+void sm3_mb_sm_x3(SM3_JOB *, SM3_JOB *, SM3_JOB *, int);
+#endif
+#if SM3_MB_CE_MAX_LANES >=2
+void sm3_mb_sm_x2(SM3_JOB *, SM3_JOB *, int);
+#endif
+void sm3_mb_sm_x1(SM3_JOB *, int);
+
+#define LANE_IS_NOT_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FREE(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL)
+#define LANE_IS_INVALID(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL)
+void sm3_mb_mgr_init_sm(SM3_MB_JOB_MGR * state)
+{
+ unsigned int i;
+
+ state->unused_lanes = 0xf;
+ state->num_lanes_inuse = 0;
+ for (i = 0; i < SM3_MB_CE_MAX_LANES; i++) {
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= SM3_MB_CE_MAX_LANES - 1 - i;
+ state->lens[i] = i;
+ state->ldata[i].job_in_lane = 0;
+ }
+
+	//lanes beyond SM3_MB_CE_MAX_LANES are marked invalid
+ for (; i < SM3_MAX_LANES; i++) {
+ state->lens[i] = 0xf;
+ state->ldata[i].job_in_lane = 0;
+ }
+}
+
+static int sm3_mb_mgr_do_jobs(SM3_MB_JOB_MGR * state)
+{
+ int lane_idx, len, i, lanes;
+
+ int lane_idx_array[SM3_MAX_LANES];
+
+ if (state->num_lanes_inuse == 0) {
+ return -1;
+ }
+#if SM3_MB_CE_MAX_LANES == 4
+ if (state->num_lanes_inuse == 4) {
+ len = min(min(state->lens[0], state->lens[1]),
+ min(state->lens[2], state->lens[3]));
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+
+ sm3_mb_sm_x4(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane,
+ state->ldata[2].job_in_lane,
+ state->ldata[3].job_in_lane, len >> 4);
+
+ } else
+#elif SM3_MB_CE_MAX_LANES == 3
+ if (state->num_lanes_inuse == 3) {
+ len = min(min(state->lens[0], state->lens[1]), state->lens[2]);
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+
+ sm3_mb_sm_x3(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane,
+ state->ldata[2].job_in_lane, len >> 4);
+
+ } else
+#elif SM3_MB_CE_MAX_LANES == 2
+ if (state->num_lanes_inuse == 2) {
+ len = min(state->lens[0], state->lens[1]);
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+ sm3_mb_sm_x2(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane, len >> 4);
+
+ } else
+#endif
+ {
+ lanes = 0, len = 0;
+ for (i = 0; i < SM3_MAX_LANES && lanes < state->num_lanes_inuse; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ if (lanes)
+ len = min(len, state->lens[i]);
+ else
+ len = state->lens[i];
+ lane_idx_array[lanes] = i;
+ lanes++;
+ }
+ }
+ if (lanes == 0)
+ return -1;
+ lane_idx = len & 0xf;
+ len = len & (~0xf);
+#if SM3_MB_CE_MAX_LANES >=4
+ if (lanes == 4) {
+ sm3_mb_sm_x4(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane,
+ state->ldata[lane_idx_array[2]].job_in_lane,
+ state->ldata[lane_idx_array[3]].job_in_lane, len >> 4);
+ } else
+#endif
+#if SM3_MB_CE_MAX_LANES >=3
+ if (lanes == 3) {
+ sm3_mb_sm_x3(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane,
+ state->ldata[lane_idx_array[2]].job_in_lane, len >> 4);
+ } else
+#endif
+#if SM3_MB_CE_MAX_LANES >=2
+ if (lanes == 2) {
+ sm3_mb_sm_x2(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane, len >> 4);
+ } else
+#endif
+ {
+ sm3_mb_sm_x1(state->ldata[lane_idx_array[0]].job_in_lane, len >> 4);
+ }
+ }
+ //only return the min length job
+ for (i = 0; i < SM3_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 2;
+ }
+ }
+
+ return lane_idx;
+
+}
+
+static SM3_JOB *sm3_mb_mgr_free_lane(SM3_MB_JOB_MGR * state)
+{
+ int i;
+ SM3_JOB *ret = NULL;
+
+ for (i = 0; i < SM3_MB_CE_MAX_LANES; i++) {
+ if (LANE_IS_FINISHED(state, i)) {
+
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->num_lanes_inuse--;
+ ret = state->ldata[i].job_in_lane;
+ ret->status = STS_COMPLETED;
+ state->ldata[i].job_in_lane = NULL;
+ break;
+ }
+ }
+ return ret;
+}
+
+static void sm3_mb_mgr_insert_job(SM3_MB_JOB_MGR * state, SM3_JOB * job)
+{
+ int lane_idx;
+ //add job into lanes
+ lane_idx = state->unused_lanes & 0xf;
+ //fatal error
+ assert(lane_idx < SM3_MB_CE_MAX_LANES);
+ state->lens[lane_idx] = (job->len << 4) | lane_idx;
+ state->ldata[lane_idx].job_in_lane = job;
+ state->unused_lanes >>= 4;
+ state->num_lanes_inuse++;
+}
+
+SM3_JOB *sm3_mb_mgr_submit_sm(SM3_MB_JOB_MGR * state, SM3_JOB * job)
+{
+#ifndef NDEBUG
+ int lane_idx;
+#endif
+ SM3_JOB *ret;
+
+ //add job into lanes
+ sm3_mb_mgr_insert_job(state, job);
+
+ ret = sm3_mb_mgr_free_lane(state);
+ if (ret != NULL) {
+ return ret;
+ }
+	//submit defers processing until every lane has a job
+ if (state->num_lanes_inuse < SM3_MB_CE_MAX_LANES)
+ return NULL;
+#ifndef NDEBUG
+ lane_idx = sm3_mb_mgr_do_jobs(state);
+ assert(lane_idx != -1);
+#else
+ sm3_mb_mgr_do_jobs(state);
+#endif
+
+ ret = sm3_mb_mgr_free_lane(state);
+ return ret;
+}
+
+SM3_JOB *sm3_mb_mgr_flush_sm(SM3_MB_JOB_MGR * state)
+{
+ SM3_JOB *ret;
+ ret = sm3_mb_mgr_free_lane(state);
+ if (ret) {
+ return ret;
+ }
+
+ sm3_mb_mgr_do_jobs(state);
+ return sm3_mb_mgr_free_lane(state);
+
+}
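sm3_mb_mgr_init_sm() above seeds unused_lanes as a stack of free-lane nibbles terminated by the 0xf sentinel; sm3_mb_mgr_insert_job() pops the low nibble to pick a lane and sm3_mb_mgr_free_lane() pushes it back when the job completes. The same bookkeeping in isolation, as a sketch:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t unused_lanes = 0xf;
        /* Init: push lanes 3,2,1,0 so lane 0 ends up in the low nibble. */
        for (int i = 0; i < 4; i++) {
            unused_lanes <<= 4;
            unused_lanes |= 4 - 1 - i;
        }
        assert(unused_lanes == 0xf3210);

        /* Submit pops the low nibble as the lane to fill... */
        int lane = unused_lanes & 0xf;
        unused_lanes >>= 4;
        assert(lane == 0 && unused_lanes == 0xf321);

        /* ...and a finished job pushes its lane back onto the stack. */
        unused_lanes <<= 4;
        unused_lanes |= lane;
        assert(unused_lanes == 0xf3210);
        return 0;
    }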
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S
new file mode 100644
index 000000000..836bd9ccc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S
@@ -0,0 +1,36 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#include "aarch64_multibinary.h"
+
+
+mbin_interface sm3_ctx_mgr_submit
+mbin_interface sm3_ctx_mgr_init
+mbin_interface sm3_ctx_mgr_flush
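The mbin_interface entries above bind sm3_ctx_mgr_init/submit/flush to a concrete implementation at run time. The actual selection logic lives in aarch64_multibinary.h and is not shown in this patch; conceptually it prefers the SM3-extension kernels when the CPU advertises them and falls back to ASIMD, roughly like this hypothetical Linux-only resolver (the function name and fallback HWCAP value are assumptions for illustration):

    #include <sys/auxv.h>
    #include "sm3_mb.h"

    #ifndef HWCAP_SM3
    #define HWCAP_SM3 (1 << 18)   /* aarch64 hwcap bit for the SM3 instructions */
    #endif

    void sm3_ctx_mgr_init_sm(SM3_HASH_CTX_MGR *mgr);
    void sm3_ctx_mgr_init_asimd(SM3_HASH_CTX_MGR *mgr);

    /* Illustrative dispatcher: prefer the sm3-extension path, else ASIMD. */
    void sm3_ctx_mgr_init_dispatch(SM3_HASH_CTX_MGR *mgr)
    {
        if (getauxval(AT_HWCAP) & HWCAP_SM3)
            sm3_ctx_mgr_init_sm(mgr);
        else
            sm3_ctx_mgr_init_asimd(mgr);
    }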
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x1.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x1.S
new file mode 100644
index 000000000..f92ac5e9f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x1.S
@@ -0,0 +1,237 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a+sm4
+ .text
+ .align 2
+ .p2align 3,,7
+
+.macro declare_var_vector_reg name:req,reg:req
+ q\name\() .req q\reg
+ v\name\() .req v\reg
+ s\name\() .req s\reg
+.endm
+
+.macro message_expand msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req
+ ext v\msg4\().16b, v\msg1\().16b, v\msg2\().16b, #12
+ ext v\tmp0\().16b, v\msg0\().16b, v\msg1\().16b, #12
+ ext v\tmp1\().16b, v\msg2\().16b, v\msg3\().16b, #8
+ sm3partw1 v\msg4\().4s, v\msg0\().4s, v\msg3\().4s
+ sm3partw2 v\msg4\().4s, v\tmp1\().4s, v\tmp0\().4s
+
+.endm
+
+.macro quad_round ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,tmp0:req,tmp1:req
+ eor v\tmp0\().16b, v\msg0\().16b, v\msg1\().16b
+
+
+ sm3ss1 v\tmp1\().4s, v\dig0\().4s, v\dig1\().4s, v\const\().4s
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ sm3tt1\ab v\dig0\().4s, v\tmp1\().4s, v\tmp0\().4s[0]
+ sm3tt2\ab v\dig1\().4s, v\tmp1\().4s, v\msg0\().4s[0]
+
+ sm3ss1 v\tmp1\().4s, v\dig0\().4s, v\dig1\().4s, v\const\().4s
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ sm3tt1\ab v\dig0\().4s, v\tmp1\().4s, v\tmp0\().4s[1]
+ sm3tt2\ab v\dig1\().4s, v\tmp1\().4s, v\msg0\().4s[1]
+
+ sm3ss1 v\tmp1\().4s, v\dig0\().4s, v\dig1\().4s, v\const\().4s
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ sm3tt1\ab v\dig0\().4s, v\tmp1\().4s, v\tmp0\().4s[2]
+ sm3tt2\ab v\dig1\().4s, v\tmp1\().4s, v\msg0\().4s[2]
+
+ sm3ss1 v\tmp1\().4s, v\dig0\().4s, v\dig1\().4s, v\const\().4s
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ sm3tt1\ab v\dig0\().4s, v\tmp1\().4s, v\tmp0\().4s[3]
+ sm3tt2\ab v\dig1\().4s, v\tmp1\().4s, v\msg0\().4s[3]
+
+.endm
+
+.macro quad_round_expand ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req
+ message_expand \msg0,\msg1,\msg2,\msg3,\msg4,\tmp0,\tmp1
+ quad_round \ab,\const,\dig0,\dig1,\msg0,\msg1,\tmp0,\tmp1
+.endm
+ job .req x0
+ len .req x1
+ data .req x2
+ digest .req x0
+ end_ptr .req x1
+
+
+ declare_var_vector_reg msg0,0
+ declare_var_vector_reg msg1,1
+ declare_var_vector_reg msg2,2
+ declare_var_vector_reg msg3,3
+ declare_var_vector_reg msg4,4
+ declare_var_vector_reg dig0,5
+ declare_var_vector_reg dig1,6
+ declare_var_vector_reg backup_dig0, 7
+
+ declare_var_vector_reg tmp0,16
+ declare_var_vector_reg tmp1,17
+ declare_var_vector_reg backup_dig1, 18
+
+ declare_var_vector_reg const0,19
+ declare_var_vector_reg const1,20
+ declare_var_vector_reg const2,21
+ declare_var_vector_reg const3,22
+ declare_var_vector_reg const4,23
+ declare_var_vector_reg const5,24
+ declare_var_vector_reg const6,25
+ declare_var_vector_reg const7,26
+ declare_var_vector_reg const8,27
+ declare_var_vector_reg const9,28
+ declare_var_vector_reg const10,29
+ declare_var_vector_reg const11,30
+
+
+
+
+ .global sm3_mb_sm_x1
+ .type sm3_mb_sm_x1, %function
+sm3_mb_sm_x1:
+ adrp x3,.consts
+ ldr data, [job],64
+ add x3,x3,:lo12:.consts
+ ldp qdig0,qdig1,[digest]
+ ld1 {vconst0.16b-vconst3.16b},[x3],64
+ add end_ptr,data,len,lsl 6
+ ld1 {vconst4.16b-vconst7.16b},[x3],64
+ //rev128
+ ext vdig0.16b,vdig0.16b,vdig0.16b,#8
+ ext vdig1.16b,vdig1.16b,vdig1.16b,#8
+ ld1 {vconst8.16b-vconst11.16b},[x3],64
+ rev64 vdig0.16b,vdig0.16b
+ rev64 vdig1.16b,vdig1.16b
+
+
+start_loop:
+ mov vbackup_dig0.16b,vdig0.16b
+ mov vbackup_dig1.16b,vdig1.16b
+ ldp qmsg0,qmsg1,[data],32
+ ldp qmsg2,qmsg3,[data],32
+
+ // big-endian to little-endian
+ rev32 vmsg0.16b,vmsg0.16b
+ rev32 vmsg1.16b,vmsg1.16b
+ rev32 vmsg2.16b,vmsg2.16b
+ rev32 vmsg3.16b,vmsg3.16b
+
+ quad_round_expand a, const0, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ quad_round_expand a, const1, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ quad_round_expand a, const2, dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ quad_round_expand a, const3, dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+ quad_round_expand b, const4, dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+ quad_round_expand b, const5, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ quad_round_expand b, const6, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ quad_round_expand b, const7, dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ quad_round_expand b, const8, dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+ quad_round_expand b, const9, dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+ quad_round_expand b, const10, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ quad_round_expand b, const11, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ quad_round_expand b, const4, dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+
+
+ quad_round b, const5, dig0, dig1, msg3, msg4, tmp0, tmp1
+ cmp data,end_ptr
+ quad_round b, const6, dig0, dig1, msg4, msg0, tmp0, tmp1
+ quad_round b, const7, dig0, dig1, msg0, msg1, tmp0, tmp1
+
+ eor vdig0.16b,vdig0.16b,vbackup_dig0.16b
+ eor vdig1.16b,vdig1.16b,vbackup_dig1.16b
+
+
+ bcc start_loop
+
+ //rev128
+ ext vdig0.16b,vdig0.16b,vdig0.16b,#8
+ ext vdig1.16b,vdig1.16b,vdig1.16b,#8
+ rev64 vdig0.16b,vdig0.16b
+ rev64 vdig1.16b,vdig1.16b
+ str qdig0,[digest]
+ str qdig1,[digest,16]
+ ret
+ dsb ish
+ isb
+ .align 2
+.consts:
+ .word 0xce6228cb // 3
+ .word 0xe7311465 // 2
+ .word 0xf3988a32 // 1
+ .word 0x79cc4519 // 0
+ .word 0xe6228cbc // 7
+ .word 0x7311465e // 6
+ .word 0x3988a32f // 5
+ .word 0x9cc45197 // 4
+ .word 0x6228cbce //11
+ .word 0x311465e7 //10
+ .word 0x988a32f3 // 9
+ .word 0xcc451979 // 8
+ .word 0x228cbce6 //15
+ .word 0x11465e73 //14
+ .word 0x88a32f39 //13
+ .word 0xc451979c //12
+ .word 0xec53d43c //19
+ .word 0x7629ea1e //18
+ .word 0x3b14f50f //17
+ .word 0x9d8a7a87 //16
+ .word 0xc53d43ce //23
+ .word 0x629ea1e7 //22
+ .word 0xb14f50f3 //21
+ .word 0xd8a7a879 //20
+ .word 0x53d43cec //27
+ .word 0x29ea1e76 //26
+ .word 0x14f50f3b //25
+ .word 0x8a7a879d //24
+ .word 0x3d43cec5 //31
+ .word 0x9ea1e762 //30
+ .word 0x4f50f3b1 //29
+ .word 0xa7a879d8 //28
+ .word 0xd43cec53 //35
+ .word 0xea1e7629 //34
+ .word 0xf50f3b14 //33
+ .word 0x7a879d8a //32
+ .word 0x43cec53d //39
+ .word 0xa1e7629e //38
+ .word 0x50f3b14f //37
+ .word 0xa879d8a7 //36
+ .word 0x3cec53d4 //43
+ .word 0x1e7629ea //42
+ .word 0x0f3b14f5 //41
+ .word 0x879d8a7a //40
+ .word 0xcec53d43 //47
+ .word 0xe7629ea1 //46
+ .word 0xf3b14f50 //45
+ .word 0x79d8a7a8 //44
+ .word 0xec53d43c //51
+ .word 0x7629ea1e //50
+ .word 0x3b14f50f //49
+
+
+ .size sm3_mb_sm_x1, .-sm3_mb_sm_x1
+
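message_expand in sm3_mb_sm_x1 uses sm3partw1/sm3partw2 to derive four new schedule words per step. Together those instructions implement the SM3 expansion recurrence W[j] = P1(W[j-16] ^ W[j-9] ^ ROTL32(W[j-3],15)) ^ ROTL32(W[j-13],7) ^ W[j-6], where P1(x) = x ^ ROTL32(x,15) ^ ROTL32(x,23). A scalar reference for cross-checking one lane; the names are illustrative only.

    #include <stdint.h>

    static inline uint32_t rotl32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }
    static inline uint32_t P1(uint32_t x) { return x ^ rotl32(x, 15) ^ rotl32(x, 23); }

    /* Expand the 16 message words of one block into the 68-word schedule. */
    static void sm3_expand_ref(const uint32_t w16[16], uint32_t W[68])
    {
        int j;
        for (j = 0; j < 16; j++)
            W[j] = w16[j];
        for (j = 16; j < 68; j++)
            W[j] = P1(W[j - 16] ^ W[j - 9] ^ rotl32(W[j - 3], 15))
                   ^ rotl32(W[j - 13], 7) ^ W[j - 6];
    }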
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x2.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x2.S
new file mode 100644
index 000000000..4e4a6e738
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x2.S
@@ -0,0 +1,344 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a+sm4
+ .text
+ .align 2
+ .p2align 3,,7
+
+.macro declare_var_vector_reg name:req,reg:req
+ q\name\() .req q\reg
+ v\name\() .req v\reg
+ s\name\() .req s\reg
+.endm
+
+.macro do_ext job,arg0,arg1,arg2,arg3
+ ext v\job\()_\arg0\().16b,v\job\()_\arg1\().16b,v\job\()_\arg2\().16b,\arg3
+.endm
+.macro do_sm3partw1 job,msg4,msg0,msg3
+ sm3partw1 v\job\()_\msg4\().4s, v\job\()_\msg0\().4s, v\job\()_\msg3\().4s
+.endm
+.macro do_sm3partw2 job,msg4,tmp1,tmp0
+ sm3partw2 v\job\()_\msg4\().4s, v\job\()_\tmp1\().4s, v\job\()_\tmp0\().4s
+.endm
+
+.macro message_expand msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req
+ .irp j,0,1
+ do_ext job\j,\msg4,\msg1,\msg2,#12
+ .endr
+ .irp j,0,1
+ do_ext job\j,\tmp0,\msg0,\msg1,#12
+ .endr
+ .irp j,0,1
+ do_ext job\j,\tmp1,\msg2,\msg3,#8
+ .endr
+
+ .irp j,0,1
+ do_sm3partw1 job\j,\msg4, \msg0, \msg3
+ .endr
+ .irp j,0,1
+ do_sm3partw2 job\j,\msg4, \tmp1, \tmp0
+ .endr
+
+.endm
+
+.macro do_eor job,arg0,arg1,arg2
+ eor v\job\()_\arg0\().16b,v\job\()_\arg1\().16b,v\job\()_\arg2\().16b
+.endm
+.macro do_sm3ss1 job,tmp1,dig0,dig1,const
+ sm3ss1 v\job\()_\tmp1\().4s, v\job\()_\dig0\().4s, v\job\()_\dig1\().4s, v\const\().4s
+.endm
+
+.macro do_sm3tt1 job,ab,dig0,tmp1,tmp0,lane
+ sm3tt1\ab v\job\()_\dig0\().4s, v\job\()_\tmp1\().4s, v\job\()_\tmp0\().4s[\lane]
+
+.endm
+.macro do_sm3tt2 job,ab,dig1,tmp1,msg0,lane
+ sm3tt2\ab v\job\()_\dig1\().4s, v\job\()_\tmp1\().4s, v\job\()_\msg0\().4s[\lane]
+.endm
+
+.macro quad_round ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,tmp0:req,tmp1:req
+ .irp j,0,1
+ do_eor job\j,\tmp0,\msg0,\msg1
+ .endr
+ .irp lane,0,1,2,3
+ .irp j,0,1
+ do_sm3ss1 job\j,\tmp1,\dig0,\dig1,\const
+ .endr
+
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ .irp j,0,1
+ do_sm3tt1 job\j,\ab,\dig0,\tmp1,\tmp0,\lane
+ .endr
+ .irp j,0,1
+ do_sm3tt2 job\j,\ab,\dig1,\tmp1,\msg0,\lane
+ .endr
+ .endr
+.endm
+
+.macro quad_round_expand ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req
+ message_expand \msg0,\msg1,\msg2,\msg3,\msg4,\tmp0,\tmp1
+ quad_round \ab,\const,\dig0,\dig1,\msg0,\msg1,\tmp0,\tmp1
+.endm
+
+/*
+ Variables
+*/
+ job0 .req x0
+ job1 .req x1
+ len .req x2
+
+ job0_data .req x3
+ job1_data .req x4
+ job0_digest .req x0
+ job1_digest .req x1
+
+ const_adr .req x5
+ end_ptr .req x2
+
+ declare_var_vector_reg job0_msg0, 0
+ declare_var_vector_reg job0_msg1, 1
+ declare_var_vector_reg job0_msg2, 2
+ declare_var_vector_reg job0_msg3, 3
+ declare_var_vector_reg job0_msg4, 4
+ declare_var_vector_reg job0_dig0, 5
+ declare_var_vector_reg job0_dig1, 6
+ declare_var_vector_reg job0_tmp0, 7
+ declare_var_vector_reg job0_tmp1, 8
+ declare_var_vector_reg job0_backup_dig0, 9
+ declare_var_vector_reg job0_backup_dig1, 10
+
+ declare_var_vector_reg job1_msg0, 11
+ declare_var_vector_reg job1_msg1, 12
+ declare_var_vector_reg job1_msg2, 13
+ declare_var_vector_reg job1_msg3, 14
+ declare_var_vector_reg job1_msg4, 15
+ declare_var_vector_reg job1_dig0, 16
+ declare_var_vector_reg job1_dig1, 17
+ declare_var_vector_reg job1_tmp0, 18
+ declare_var_vector_reg job1_tmp1, 19
+ declare_var_vector_reg job1_backup_dig0, 20
+ declare_var_vector_reg job1_backup_dig1, 21
+
+ declare_var_vector_reg const0, 22
+ declare_var_vector_reg const1, 23
+ declare_var_vector_reg const2, 24
+ declare_var_vector_reg const3, 25
+ declare_var_vector_reg const4, 26
+ declare_var_vector_reg const5, 27
+ declare_var_vector_reg const6, 28
+ declare_var_vector_reg const7, 29
+ declare_var_vector_reg const8, 30
+ declare_var_vector_reg const9, 31
+ declare_var_vector_reg const10, 22
+ declare_var_vector_reg const11, 23
+
+.macro do_rev32_msg job:req,msg:req
+ rev32 v\job\()_\msg\().16b,v\job\()_\msg\().16b
+.endm
+.macro do_rev32_job job:req
+ .irp m,0,1,2,3
+ do_rev32_msg \job,msg\m
+ .endr
+.endm
+.macro rev32_msgs
+ .irp j,0,1
+ do_rev32_job job\j
+ .endr
+.endm
+
+
+ .global sm3_mb_sm_x2
+ .type sm3_mb_sm_x2, %function
+sm3_mb_sm_x2:
+ //push d8~d15
+ stp d8,d9,[sp,-192]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+
+
+ adrp const_adr,.consts
+ ldr job0_data, [job0],64
+ add const_adr,const_adr,:lo12:.consts
+ ldr job1_data, [job1],64
+ ldp qjob0_dig0,qjob0_dig1,[job0_digest]
+ ldp qjob1_dig0,qjob1_dig1,[job1_digest]
+
+ ldp qconst2,qconst3,[const_adr,32]
+ ldp qconst4,qconst5,[const_adr,64]
+ ldp qconst6,qconst7,[const_adr,96]
+ ldp qconst8,qconst9,[const_adr,128]
+ add end_ptr,job0_data,len,lsl 6
+
+ //rev128
+ ext vjob0_dig0.16b,vjob0_dig0.16b,vjob0_dig0.16b,#8
+ ext vjob0_dig1.16b,vjob0_dig1.16b,vjob0_dig1.16b,#8
+ rev64 vjob0_dig0.16b,vjob0_dig0.16b
+ rev64 vjob0_dig1.16b,vjob0_dig1.16b
+ ext vjob1_dig0.16b,vjob1_dig0.16b,vjob1_dig0.16b,#8
+ ext vjob1_dig1.16b,vjob1_dig1.16b,vjob1_dig1.16b,#8
+ rev64 vjob1_dig0.16b,vjob1_dig0.16b
+ rev64 vjob1_dig1.16b,vjob1_dig1.16b
+
+
+
+
+
+start_loop:
+
+ ld1 {vjob0_msg0.16b-vjob0_msg3.16b},[job0_data],64
+ ld1 {vjob1_msg0.16b-vjob1_msg3.16b},[job1_data],64
+
+ mov vjob0_backup_dig0.16b,vjob0_dig0.16b
+ mov vjob0_backup_dig1.16b,vjob0_dig1.16b
+ mov vjob1_backup_dig0.16b,vjob1_dig0.16b
+ mov vjob1_backup_dig1.16b,vjob1_dig1.16b
+
+ // const10,const11,const0,const1 share registers
+ ldp qconst0,qconst1,[const_adr]
+
+ // big-endian to little-endian
+ rev32_msgs
+
+ cmp job0_data,end_ptr
+ quad_round_expand a, const0 , dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+
+
+ quad_round_expand a, const1 , dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ // const10,const11,const0,const1 share registers
+ ldp qconst10,qconst11,[const_adr,160]
+ quad_round_expand a, const2 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ quad_round_expand a, const3 , dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+ quad_round_expand b, const4 , dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+ quad_round_expand b, const5 , dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ quad_round_expand b, const6 , dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ quad_round_expand b, const7 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ quad_round_expand b, const8 , dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+ quad_round_expand b, const9 , dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+ quad_round_expand b, const10, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ quad_round_expand b, const11, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ quad_round_expand b, const4 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+
+
+ quad_round b, const5, dig0, dig1, msg3, msg4, tmp0, tmp1
+
+ quad_round b, const6, dig0, dig1, msg4, msg0, tmp0, tmp1
+ quad_round b, const7, dig0, dig1, msg0, msg1, tmp0, tmp1
+
+ eor vjob0_dig0.16b,vjob0_dig0.16b,vjob0_backup_dig0.16b
+ eor vjob0_dig1.16b,vjob0_dig1.16b,vjob0_backup_dig1.16b
+ eor vjob1_dig0.16b,vjob1_dig0.16b,vjob1_backup_dig0.16b
+ eor vjob1_dig1.16b,vjob1_dig1.16b,vjob1_backup_dig1.16b
+
+
+ bcc start_loop
+
+ //rev128
+ ext vjob0_dig0.16b,vjob0_dig0.16b,vjob0_dig0.16b,#8
+ ext vjob0_dig1.16b,vjob0_dig1.16b,vjob0_dig1.16b,#8
+ rev64 vjob0_dig0.16b,vjob0_dig0.16b
+ rev64 vjob0_dig1.16b,vjob0_dig1.16b
+ stp qjob0_dig0,qjob0_dig1,[job0_digest]
+
+ ext vjob1_dig0.16b,vjob1_dig0.16b,vjob1_dig0.16b,#8
+ ext vjob1_dig1.16b,vjob1_dig1.16b,vjob1_dig1.16b,#8
+ rev64 vjob1_dig0.16b,vjob1_dig0.16b
+ rev64 vjob1_dig1.16b,vjob1_dig1.16b
+ stp qjob1_dig0,qjob1_dig1,[job1_digest]
+
+#if 1
+ mov v0.16b,vjob1_dig0.16b
+ mov v1.16b,vjob1_dig1.16b
+ b exit_ret
+#endif
+
+exit_ret:
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], 192
+ ret
+
+ .align 2
+.consts:
+ .word 0xce6228cb // 3
+ .word 0xe7311465 // 2
+ .word 0xf3988a32 // 1
+ .word 0x79cc4519 // 0
+ .word 0xe6228cbc // 7
+ .word 0x7311465e // 6
+ .word 0x3988a32f // 5
+ .word 0x9cc45197 // 4
+ .word 0x6228cbce //11
+ .word 0x311465e7 //10
+ .word 0x988a32f3 // 9
+ .word 0xcc451979 // 8
+ .word 0x228cbce6 //15
+ .word 0x11465e73 //14
+ .word 0x88a32f39 //13
+ .word 0xc451979c //12
+ .word 0xec53d43c //19
+ .word 0x7629ea1e //18
+ .word 0x3b14f50f //17
+ .word 0x9d8a7a87 //16
+ .word 0xc53d43ce //23
+ .word 0x629ea1e7 //22
+ .word 0xb14f50f3 //21
+ .word 0xd8a7a879 //20
+ .word 0x53d43cec //27
+ .word 0x29ea1e76 //26
+ .word 0x14f50f3b //25
+ .word 0x8a7a879d //24
+ .word 0x3d43cec5 //31
+ .word 0x9ea1e762 //30
+ .word 0x4f50f3b1 //29
+ .word 0xa7a879d8 //28
+ .word 0xd43cec53 //35
+ .word 0xea1e7629 //34
+ .word 0xf50f3b14 //33
+ .word 0x7a879d8a //32
+ .word 0x43cec53d //39
+ .word 0xa1e7629e //38
+ .word 0x50f3b14f //37
+ .word 0xa879d8a7 //36
+ .word 0x3cec53d4 //43
+ .word 0x1e7629ea //42
+ .word 0x0f3b14f5 //41
+ .word 0x879d8a7a //40
+ .word 0xcec53d43 //47
+ .word 0xe7629ea1 //46
+ .word 0xf3b14f50 //45
+ .word 0x79d8a7a8 //44
+ .word 0xec53d43c //51
+ .word 0x7629ea1e //50
+ .word 0x3b14f50f //49
+
+
+ .size sm3_mb_sm_x2, .-sm3_mb_sm_x2
+
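For orientation: the x2 routine above advances two SM3_JOB descriptors in lockstep. The post-indexed loads suggest each job's buffer pointer sits at offset 0, the running digest 64 bytes in, and that len counts whole 64-byte blocks. A minimal caller sketch in C under those assumptions — the prototype below is inferred from the register usage, not taken from a header in this patch:

    #include "sm3_mb.h"

    /* inferred prototype: both jobs advance by the same number of 64-byte blocks */
    void sm3_mb_sm_x2(SM3_JOB *job0, SM3_JOB *job1, int num_blocks);

    static void hash_two_lanes(SM3_JOB *a, SM3_JOB *b, int num_blocks)
    {
            /* a->buffer and b->buffer must each hold num_blocks * 64 bytes */
            sm3_mb_sm_x2(a, b, num_blocks);
    }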
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x3.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x3.S
new file mode 100644
index 000000000..58758f98d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x3.S
@@ -0,0 +1,368 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .arch armv8.2-a+sm4
+ .text
+ .align 2
+ .p2align 3,,7
+
+.macro declare_var_vector_reg name:req,reg:req
+ q\name\() .req q\reg
+ v\name\() .req v\reg
+ s\name\() .req s\reg
+.endm
+
+.macro do_ext job,arg0,arg1,arg2,arg3
+ ext vjob\job\()_\arg0\().16b,vjob\job\()_\arg1\().16b,vjob\job\()_\arg2\().16b,\arg3
+.endm
+.macro do_sm3partw1 job,msg4,msg0,msg3
+ sm3partw1 vjob\job\()_\msg4\().4s, vjob\job\()_\msg0\().4s, vjob\job\()_\msg3\().4s
+.endm
+.macro do_sm3partw2 job,msg4,tmp1,tmp0
+ sm3partw2 vjob\job\()_\msg4\().4s, vjob\job\()_\tmp1\().4s, vjob\job\()_\tmp0\().4s
+.endm
+
+.macro message_expand msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req
+ .irp j,0,1,2
+ do_ext \j,\msg4,\msg1,\msg2,#12
+ .endr
+ .irp j,0,1,2
+ do_ext \j,\tmp0,\msg0,\msg1,#12
+ .endr
+ .irp j,0,1,2
+ do_ext \j,\tmp1,\msg2,\msg3,#8
+ .endr
+
+ .irp j,0,1,2
+ do_sm3partw1 \j,\msg4, \msg0, \msg3
+ .endr
+ .irp j,0,1,2
+ do_sm3partw2 \j,\msg4, \tmp1, \tmp0
+ .endr
+
+.endm
+
+.macro do_eor job,arg0,arg1,arg2
+ eor v\job\()_\arg0\().16b,v\job\()_\arg1\().16b,v\job\()_\arg2\().16b
+.endm
+.macro do_sm3ss1 job,tmp1,dig0,dig1,const
+ sm3ss1 v\job\()_\tmp1\().4s, v\job\()_\dig0\().4s, v\job\()_\dig1\().4s, v\const\().4s
+.endm
+
+.macro do_sm3tt1 job,ab,dig0,tmp1,tmp0,lane
+ sm3tt1\ab v\job\()_\dig0\().4s, v\job\()_\tmp1\().4s, v\job\()_\tmp0\().4s[\lane]
+
+.endm
+.macro do_sm3tt2 job,ab,dig1,tmp1,msg0,lane
+ sm3tt2\ab v\job\()_\dig1\().4s, v\job\()_\tmp1\().4s, v\job\()_\msg0\().4s[\lane]
+.endm
+.macro do_ld_backup_digest job
+ ldp qjob\job\()_backup_dig0,qjob\job\()_backup_dig1,[sp,job\job\()_dig_off]
+.endm
+
+.macro do_st_digest job
+ stp qjob\job\()_dig0,qjob\job\()_dig1,[job\job\()_digest]
+.endm
+.macro quad_round ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,tmp0:req,tmp1:req,load_digest
+ .irp j,0,1,2
+ do_eor job\j,\tmp0,\msg0,\msg1
+ .ifnb \load_digest
+ do_ld_backup_digest \j
+ .endif
+ .endr
+ .irp lane,0,1,2,3
+ .irp j,0,1,2
+ do_sm3ss1 job\j,\tmp1,\dig0,\dig1,\const
+ .endr
+
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ .irp j,0,1,2
+ do_sm3tt1 job\j,\ab,\dig0,\tmp1,\tmp0,\lane
+ .endr
+ .irp j,0,1,2
+ do_sm3tt2 job\j,\ab,\dig1,\tmp1,\msg0,\lane
+ .endr
+
+ .endr
+.endm
+
+.macro quad_round_expand ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req
+ message_expand \msg0,\msg1,\msg2,\msg3,\msg4,\tmp0,\tmp1
+ quad_round \ab,\const,\dig0,\dig1,\msg0,\msg1,\tmp0,\tmp1
+.endm
+
+/*
+ Variables
+*/
+ job0 .req x0
+ job1 .req x1
+ job2 .req x2
+ len .req x3
+
+ job0_data .req x4
+ job1_data .req x5
+ job2_data .req x6
+ job0_digest .req x0
+ job1_digest .req x1
+ job2_digest .req x2
+
+ const_adr .req x7
+ end_ptr .req x3
+
+ declare_var_vector_reg job0_msg0, 0
+ declare_var_vector_reg job0_msg1, 1
+ declare_var_vector_reg job0_msg2, 2
+ declare_var_vector_reg job0_msg3, 3
+ declare_var_vector_reg job0_msg4, 4
+ declare_var_vector_reg job0_dig0, 5
+ declare_var_vector_reg job0_dig1, 6
+ declare_var_vector_reg job0_tmp0, 7
+ declare_var_vector_reg job0_tmp1, 8
+ .set job0_dig_off, 64
+ declare_var_vector_reg job0_backup_dig0, 2
+ declare_var_vector_reg job0_backup_dig1, 3
+
+ declare_var_vector_reg job1_msg0, 9
+ declare_var_vector_reg job1_msg1, 10
+ declare_var_vector_reg job1_msg2, 11
+ declare_var_vector_reg job1_msg3, 12
+ declare_var_vector_reg job1_msg4, 13
+ declare_var_vector_reg job1_dig0, 14
+ declare_var_vector_reg job1_dig1, 15
+ declare_var_vector_reg job1_tmp0, 16
+ declare_var_vector_reg job1_tmp1, 17
+ .set job1_dig_off, 96
+ declare_var_vector_reg job1_backup_dig0, 11
+ declare_var_vector_reg job1_backup_dig1, 12
+
+ declare_var_vector_reg job2_msg0, 18
+ declare_var_vector_reg job2_msg1, 19
+ declare_var_vector_reg job2_msg2, 20
+ declare_var_vector_reg job2_msg3, 21
+ declare_var_vector_reg job2_msg4, 22
+ declare_var_vector_reg job2_dig0, 23
+ declare_var_vector_reg job2_dig1, 24
+ declare_var_vector_reg job2_tmp0, 25
+ declare_var_vector_reg job2_tmp1, 26
+ .set job2_dig_off, 128
+ declare_var_vector_reg job2_backup_dig0, 20
+ declare_var_vector_reg job2_backup_dig1, 21
+
+
+ declare_var_vector_reg const0, 27
+ declare_var_vector_reg const1, 28
+ declare_var_vector_reg const2, 29
+ declare_var_vector_reg const3, 30
+ declare_var_vector_reg const4, 27
+ declare_var_vector_reg const5, 28
+ declare_var_vector_reg const6, 29
+ declare_var_vector_reg const7, 30
+ declare_var_vector_reg const8, 27
+ declare_var_vector_reg const9, 28
+ declare_var_vector_reg const10, 29
+ declare_var_vector_reg const11, 30
+
+.macro do_rev32_msg job:req,msg:req
+ rev32 v\job\()_\msg\().16b,v\job\()_\msg\().16b
+.endm
+.macro do_rev32_job job:req
+ .irp m,0,1,2,3
+ do_rev32_msg \job,msg\m
+ .endr
+.endm
+.macro rev32_msgs
+ .irp j,0,1,2
+ do_rev32_job job\j
+ .endr
+.endm
+
+.macro do_rev64 job,regd,regn
+ rev64 vjob\job\()_\regd\().16b,vjob\job\()_\regd\().16b
+.endm
+
+ .global sm3_mb_sm_x3
+ .type sm3_mb_sm_x3, %function
+sm3_mb_sm_x3:
+ //push d8~d15
+ stp d8,d9,[sp,-192]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+
+
+ adrp const_adr,.consts
+ ldr job0_data, [job0],64
+ add const_adr,const_adr,:lo12:.consts
+ ldr job1_data, [job1],64
+ ldr job2_data, [job2],64
+
+ ldp qjob0_dig0,qjob0_dig1,[job0_digest]
+ ldp qjob1_dig0,qjob1_dig1,[job1_digest]
+ ldp qjob2_dig0,qjob2_dig1,[job2_digest]
+ ld1 {vconst0.16b-vconst3.16b},[const_adr]
+ add end_ptr,job0_data,len,lsl 6
+
+ //rev128
+ .irp j,0,1,2
+ do_ext \j,dig0,dig0,dig0,#8
+ do_ext \j,dig1,dig1,dig1,#8
+ do_rev64 \j,dig0,dig0
+ do_rev64 \j,dig1,dig1
+ .endr
+
+
+
+
+
+start_loop:
+
+ ld1 {vjob0_msg0.16b-vjob0_msg3.16b},[job0_data],64
+ stp qjob0_dig0,qjob0_dig1,[sp,job0_dig_off]
+ ld1 {vjob1_msg0.16b-vjob1_msg3.16b},[job1_data],64
+ stp qjob1_dig0,qjob1_dig1,[sp,job1_dig_off]
+ ld1 {vjob2_msg0.16b-vjob2_msg3.16b},[job2_data],64
+ stp qjob2_dig0,qjob2_dig1,[sp,job2_dig_off]
+
+ cmp job0_data,end_ptr
+
+ // big-endian to little-endian
+ rev32_msgs
+
+ quad_round_expand a, const0 , dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ quad_round_expand a, const1 , dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+
+ ldp qconst4,qconst5,[const_adr,4*16]
+ quad_round_expand a, const2 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ quad_round_expand a, const3 , dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+
+ ldp qconst6,qconst7,[const_adr,6*16]
+ quad_round_expand b, const4 , dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+ quad_round_expand b, const5 , dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ ldp qconst8,qconst9,[const_adr,8*16]
+ quad_round_expand b, const6 , dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ quad_round_expand b, const7 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ ldp qconst10,qconst11,[const_adr,10*16]
+ quad_round_expand b, const8 , dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+ quad_round_expand b, const9 , dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+
+ ldp qconst4,qconst5,[const_adr,4*16]
+ quad_round_expand b, const10, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ quad_round_expand b, const11, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ ldp qconst6,qconst7,[const_adr,6*16]
+ quad_round_expand b, const4 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+
+ quad_round b, const5, dig0, dig1, msg3, msg4, tmp0, tmp1
+ ldp qconst0,qconst1,[const_adr]
+ quad_round b, const6, dig0, dig1, msg4, msg0, tmp0, tmp1
+
+ quad_round b, const7, dig0, dig1, msg0, msg1, tmp0, tmp1,1
+ ldp qconst2,qconst3,[const_adr,2*16]
+
+ .irp j,0,1,2
+ do_eor job\j,dig0,dig0,backup_dig0
+ do_eor job\j,dig1,dig1,backup_dig1
+ .endr
+
+ bcc start_loop
+
+ //rev128
+ .irp j,0,1,2
+ do_ext \j,dig0,dig0,dig0,#8
+ do_ext \j,dig1,dig1,dig1,#8
+ do_rev64 \j,dig0,dig0
+ do_rev64 \j,dig1,dig1
+ do_st_digest \j
+ .endr
+
+
+
+exit_ret:
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], 192
+ ret
+
+ .align 2
+.consts:
+ .word 0xce6228cb // 3
+ .word 0xe7311465 // 2
+ .word 0xf3988a32 // 1
+ .word 0x79cc4519 // 0
+ .word 0xe6228cbc // 7
+ .word 0x7311465e // 6
+ .word 0x3988a32f // 5
+ .word 0x9cc45197 // 4
+ .word 0x6228cbce //11
+ .word 0x311465e7 //10
+ .word 0x988a32f3 // 9
+ .word 0xcc451979 // 8
+ .word 0x228cbce6 //15
+ .word 0x11465e73 //14
+ .word 0x88a32f39 //13
+ .word 0xc451979c //12
+ .word 0xec53d43c //19
+ .word 0x7629ea1e //18
+ .word 0x3b14f50f //17
+ .word 0x9d8a7a87 //16
+ .word 0xc53d43ce //23
+ .word 0x629ea1e7 //22
+ .word 0xb14f50f3 //21
+ .word 0xd8a7a879 //20
+ .word 0x53d43cec //27
+ .word 0x29ea1e76 //26
+ .word 0x14f50f3b //25
+ .word 0x8a7a879d //24
+ .word 0x3d43cec5 //31
+ .word 0x9ea1e762 //30
+ .word 0x4f50f3b1 //29
+ .word 0xa7a879d8 //28
+ .word 0xd43cec53 //35
+ .word 0xea1e7629 //34
+ .word 0xf50f3b14 //33
+ .word 0x7a879d8a //32
+ .word 0x43cec53d //39
+ .word 0xa1e7629e //38
+ .word 0x50f3b14f //37
+ .word 0xa879d8a7 //36
+ .word 0x3cec53d4 //43
+ .word 0x1e7629ea //42
+ .word 0x0f3b14f5 //41
+ .word 0x879d8a7a //40
+ .word 0xcec53d43 //47
+ .word 0xe7629ea1 //46
+ .word 0xf3b14f50 //45
+ .word 0x79d8a7a8 //44
+ .word 0xec53d43c //51
+ .word 0x7629ea1e //50
+ .word 0x3b14f50f //49
+
+
+ .size sm3_mb_sm_x3, .-sm3_mb_sm_x3
+
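A note on the //rev128 blocks used at entry and exit of these kernels: "ext v,v,v,#8" swaps the two 64-bit halves of a vector and "rev64 v.16b" byte-reverses each half, so together they byte-reverse the full 16-byte digest vector. A scalar C equivalent, shown only for reference and not part of this patch:

    #include <stdint.h>
    #include <stddef.h>

    /* Same effect as "ext v,v,v,#8" followed by "rev64 v.16b,v.16b": reverse 16 bytes. */
    static void rev128(uint8_t d[16])
    {
            for (size_t i = 0; i < 8; i++) {
                    uint8_t t = d[i];
                    d[i] = d[15 - i];
                    d[15 - i] = t;
            }
    }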
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x4.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x4.S
new file mode 100644
index 000000000..7f3f1db66
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x4.S
@@ -0,0 +1,440 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a+sm4
+ .text
+ .align 2
+ .p2align 3,,7
+
+.macro declare_var_vector_reg name:req,reg:req
+ q\name\() .req q\reg
+ v\name\() .req v\reg
+ s\name\() .req s\reg
+.endm
+
+.macro do_ext job,arg0,arg1,arg2,arg3
+ ext vjob\job\()_\arg0\().16b,vjob\job\()_\arg1\().16b,vjob\job\()_\arg2\().16b,\arg3
+.endm
+.macro do_sm3partw1 job,msg4,msg0,msg3
+ sm3partw1 vjob\job\()_\msg4\().4s, vjob\job\()_\msg0\().4s, vjob\job\()_\msg3\().4s
+.endm
+.macro do_sm3partw2 job,msg4,tmp1,tmp0
+ sm3partw2 vjob\job\()_\msg4\().4s, vjob\job\()_\tmp1\().4s, vjob\job\()_\tmp0\().4s
+.endm
+
+.macro message_expand msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req
+ .irp j,0,1,2,3
+ do_ext \j,\msg4,\msg1,\msg2,#12
+ .endr
+ .irp j,0,1,2,3
+ do_ext \j,\tmp0,\msg0,\msg1,#12
+ .endr
+ .irp j,0,1,2,3
+ do_ext \j,\tmp1,\msg2,\msg3,#8
+ .endr
+
+ .irp j,0,1,2,3
+ do_sm3partw1 \j,\msg4, \msg0, \msg3
+ .endr
+ .irp j,0,1,2,3
+ do_sm3partw2 \j,\msg4, \tmp1, \tmp0
+ .endr
+ st1 {vjob0_\msg4\().16b-vjob3_\msg4\().16b},[data_buf],64
+.endm
+
+.macro do_eor job,arg0,arg1,arg2
+ eor v\job\()_\arg0\().16b,v\job\()_\arg1\().16b,v\job\()_\arg2\().16b
+.endm
+.macro do_sm3ss1 job,tmp1,dig0,dig1,const
+ sm3ss1 v\job\()_\tmp1\().4s, v\job\()_\dig0\().4s, v\job\()_\dig1\().4s, v\const\().4s
+.endm
+
+.macro do_sm3tt1 job,ab,dig0,tmp1,tmp0,lane
+ sm3tt1\ab v\job\()_\dig0\().4s, v\job\()_\tmp1\().4s, v\job\()_\tmp0\().4s[\lane]
+
+.endm
+.macro do_sm3tt2 job,ab,dig1,tmp1,msg0,lane
+ sm3tt2\ab v\job\()_\dig1\().4s, v\job\()_\tmp1\().4s, v\job\()_\msg0\().4s[\lane]
+.endm
+.macro do_ld_backup_digest job
+ ldp qjob\job\()_backup_dig0,qjob\job\()_backup_dig1,[sp,job\job\()_dig_off]
+.endm
+
+.macro do_st_digest job
+ stp qjob\job\()_dig0,qjob\job\()_dig1,[job\job\()_digest]
+.endm
+
+.macro quad_round ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,tmp0:req,tmp1:req,is_last
+ .ifnb \is_last
+ ld1 {vjob0_backup_dig0.16b-vjob3_backup_dig0.16b},[dig_buf],64
+ .endif
+
+ .irp j,0,1,2,3
+ do_eor job\j,\tmp0,\msg0,\msg1
+
+ .endr
+
+ .irp lane,0,1,2
+ .irp j,0,1,2,3
+ do_sm3ss1 job\j,\tmp1,\dig0,\dig1,\const
+ .endr
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ .irp j,0,1,2,3
+ do_sm3tt2 job\j,\ab,\dig1,\tmp1,\msg0,\lane
+ .endr
+ .irp j,0,1,2,3
+ do_sm3tt1 job\j,\ab,\dig0,\tmp1,\tmp0,\lane
+ .endr
+
+
+ .endr
+ .irp j,0,1,2,3
+ do_sm3ss1 job\j,\tmp1,\dig0,\dig1,\const
+ .endr
+ .ifnb \is_last
+
+ ld1 {vjob0_backup_dig1.16b-vjob3_backup_dig1.16b},[dig_buf]
+ .else
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ .endif
+ .irp j,0,1,2,3
+ do_sm3tt2 job\j,\ab,\dig1,\tmp1,\msg0,3
+ .endr
+
+ .irp j,0,1,2,3
+ do_sm3tt1 job\j,\ab,\dig0,\tmp1,\tmp0,3
+ .ifnb \is_last
+ do_eor job\j,dig1,dig1,backup_dig1
+ do_eor job\j,dig0,dig0,backup_dig0
+ .endif
+ .endr
+
+ .ifb \is_last
+ ld1 {vjob0_\msg0\().16b-vjob3_\msg0\().16b},[data_buf],64
+ .endif
+
+.endm
+
+
+
+/*
+ Variables
+*/
+ .set temp_buf_size,(68*4+32)*4
+ .set dig_buf_off,64
+ .set data_buf_off,64+32*4
+ job0 .req x0
+ job1 .req x1
+ job2 .req x2
+ job3 .req x3
+ len .req x4
+
+ job0_data .req x5
+ job1_data .req x6
+ job2_data .req x7
+ job3_data .req x9
+
+ job0_digest .req x0
+ job1_digest .req x1
+ job2_digest .req x2
+ job3_digest .req x3
+
+ const_adr .req x10
+ end_ptr .req x4
+ data_buf .req x11
+ dig_buf .req x12
+
+ declare_var_vector_reg job0_msg0, 0
+ declare_var_vector_reg job1_msg0, 1
+ declare_var_vector_reg job2_msg0, 2
+ declare_var_vector_reg job3_msg0, 3
+ declare_var_vector_reg job0_msg1, 4
+ declare_var_vector_reg job1_msg1, 5
+ declare_var_vector_reg job2_msg1, 6
+ declare_var_vector_reg job3_msg1, 7
+ declare_var_vector_reg job0_msg2, 8
+ declare_var_vector_reg job1_msg2, 9
+ declare_var_vector_reg job2_msg2, 10
+ declare_var_vector_reg job3_msg2, 11
+ declare_var_vector_reg job0_msg3, 12
+ declare_var_vector_reg job1_msg3, 13
+ declare_var_vector_reg job2_msg3, 14
+ declare_var_vector_reg job3_msg3, 15
+ declare_var_vector_reg job0_tmp0, 16
+ declare_var_vector_reg job1_tmp0, 17
+ declare_var_vector_reg job2_tmp0, 18
+ declare_var_vector_reg job3_tmp0, 19
+ declare_var_vector_reg job0_tmp1, 20
+ declare_var_vector_reg job1_tmp1, 21
+ declare_var_vector_reg job2_tmp1, 22
+ declare_var_vector_reg job3_tmp1, 23
+ declare_var_vector_reg job0_msg4, 24
+ declare_var_vector_reg job1_msg4, 25
+ declare_var_vector_reg job2_msg4, 26
+ declare_var_vector_reg job3_msg4, 27
+ declare_var_vector_reg job0_dig0, 8
+ declare_var_vector_reg job1_dig0, 9
+ declare_var_vector_reg job2_dig0, 10
+ declare_var_vector_reg job3_dig0, 11
+ declare_var_vector_reg job0_dig1, 12
+ declare_var_vector_reg job1_dig1, 13
+ declare_var_vector_reg job2_dig1, 14
+ declare_var_vector_reg job3_dig1, 15
+
+ declare_var_vector_reg job0_backup_dig0, 24
+ declare_var_vector_reg job1_backup_dig0, 25
+ declare_var_vector_reg job2_backup_dig0, 26
+ declare_var_vector_reg job3_backup_dig0, 27
+ declare_var_vector_reg job0_backup_dig1, 28
+ declare_var_vector_reg job1_backup_dig1, 29
+ declare_var_vector_reg job2_backup_dig1, 30
+ declare_var_vector_reg job3_backup_dig1, 31
+
+ declare_var_vector_reg const0, 24
+ declare_var_vector_reg const1, 25
+ declare_var_vector_reg const2, 26
+ declare_var_vector_reg const3, 27
+ declare_var_vector_reg const4, 28
+ declare_var_vector_reg const5, 29
+ declare_var_vector_reg const6, 30
+ declare_var_vector_reg const7, 31
+ declare_var_vector_reg const8, 24
+ declare_var_vector_reg const9, 25
+ declare_var_vector_reg const10, 26
+ declare_var_vector_reg const11, 27
+
+.macro do_rev32_msg job:req,msg:req
+ rev32 v\job\()_\msg\().16b,v\job\()_\msg\().16b
+.endm
+
+.macro do_rev32_job job:req
+ .irp m,0,1,2,3
+ do_rev32_msg \job,msg\m
+ .endr
+.endm
+
+.macro rev32_msgs
+ .irp j,0,1,2,3
+ do_rev32_job job\j
+ .endr
+.endm
+
+.macro do_rev64 job,regd,regn
+ rev64 vjob\job\()_\regd\().16b,vjob\job\()_\regd\().16b
+.endm
+
+.macro do_ldp_msg23 job
+ ldp qjob\job\()_msg2,qjob\job\()_msg3,[job\job\()_data],32
+.endm
+
+ .global sm3_mb_sm_x4
+ .type sm3_mb_sm_x4, %function
+sm3_mb_sm_x4:
+ //push d8~d15
+ sub sp,sp,temp_buf_size
+ stp d8,d9,[sp,-64]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+
+
+
+ ldr job0_data, [job0],64
+ ldr job1_data, [job1],64
+ ldr job2_data, [job2],64
+ ldr job3_data, [job3],64
+
+ ldp qjob0_dig0,qjob0_dig1,[job0_digest]
+ ldp qjob1_dig0,qjob1_dig1,[job1_digest]
+ ldp qjob2_dig0,qjob2_dig1,[job2_digest]
+ ldp qjob3_dig0,qjob3_dig1,[job3_digest]
+ add end_ptr,job0_data,len,lsl 6
+	//rev128: change digest endianness (byte-reverse each 128-bit half)
+ .irp j,0,1,2,3
+ do_ext \j,dig0,dig0,dig0,#8
+ do_ext \j,dig1,dig1,dig1,#8
+ do_rev64 \j,dig0,dig0
+ do_rev64 \j,dig1,dig1
+ .endr
+
+
+
+
+start_loop:
+ add dig_buf,sp,dig_buf_off
+ ldp qjob0_msg0,qjob0_msg1,[job0_data],32
+ add data_buf,sp,data_buf_off
+ ldp qjob1_msg0,qjob1_msg1,[job1_data],32
+ st1 {vjob0_dig0.16b-vjob3_dig0.16b},[dig_buf],64
+ ldp qjob2_msg0,qjob2_msg1,[job2_data],32
+ st1 {vjob0_dig1.16b-vjob3_dig1.16b},[dig_buf]
+ ldp qjob3_msg0,qjob3_msg1,[job3_data],32
+
+ .irp j,0,1,2,3
+ do_ldp_msg23 \j
+ do_rev32_msg job\j,msg0
+ do_rev32_msg job\j,msg1
+ .endr
+ st1 {vjob0_msg0.16b-vjob3_msg0.16b},[data_buf],64
+ st1 {vjob0_msg1.16b-vjob3_msg1.16b},[data_buf],64
+ .irp j,0,1,2,3
+ do_rev32_msg job\j,msg2
+ do_rev32_msg job\j,msg3
+ .endr
+ st1 {vjob0_msg2.16b-vjob3_msg2.16b},[data_buf],64
+ st1 {vjob0_msg3.16b-vjob3_msg3.16b},[data_buf],64
+
+ cmp job0_data,end_ptr
+
+ /** message expand **/
+ message_expand msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ message_expand msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ message_expand msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ message_expand msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+ message_expand msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+ message_expand msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ message_expand msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ message_expand msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ message_expand msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+ message_expand msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+ message_expand msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ message_expand msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ message_expand msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+
+ /** re-init variables for sm3 rounds **/
+ add dig_buf,sp,dig_buf_off
+ ld1 {vjob0_dig0.16b-vjob3_dig0.16b},[dig_buf],64
+ add data_buf,sp,data_buf_off
+ ld1 {vjob0_dig1.16b-vjob3_dig1.16b},[dig_buf]
+ add dig_buf,sp,dig_buf_off
+ adrp const_adr,.consts
+ ld1 {vjob0_msg0.16b-vjob3_msg0.16b},[data_buf],64
+ add const_adr,const_adr,:lo12:.consts
+ ld1 {vjob0_msg1.16b-vjob3_msg1.16b},[data_buf],64
+ ld1 {vconst0.16b-vconst3.16b},[const_adr],64
+ ld1 {vconst4.16b-vconst7.16b},[const_adr],64
+ /** digests rounds **/
+ quad_round a, const0 , dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round a, const1 , dig0, dig1, msg1, msg0, tmp0, tmp1
+ quad_round a, const2 , dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round a, const3 , dig0, dig1, msg1, msg0, tmp0, tmp1
+
+ /** share registers with vconst0-vconst3 **/
+ ld1 {vconst8.16b-vconst11.16b},[const_adr]
+
+ quad_round b, const4 , dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round b, const5 , dig0, dig1, msg1, msg0, tmp0, tmp1
+ quad_round b, const6 , dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round b, const7 , dig0, dig1, msg1, msg0, tmp0, tmp1
+ quad_round b, const8 , dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round b, const9 , dig0, dig1, msg1, msg0, tmp0, tmp1
+ quad_round b, const10, dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round b, const11, dig0, dig1, msg1, msg0, tmp0, tmp1
+ quad_round b, const4 , dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round b, const5 , dig0, dig1, msg1, msg0, tmp0, tmp1
+ quad_round b, const6 , dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round b, const7 , dig0, dig1, msg1, msg0, tmp0, tmp1,1
+
+ bcc start_loop
+
+ //rev128
+ .irp j,0,1,2,3
+ do_ext \j,dig0,dig0,dig0,#8
+ do_ext \j,dig1,dig1,dig1,#8
+ do_rev64 \j,dig0,dig0
+ do_rev64 \j,dig1,dig1
+ do_st_digest \j
+ .endr
+
+
+
+exit_ret:
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], 64
+ add sp,sp,temp_buf_size
+ ret
+
+ .align 2
+.consts:
+ .word 0xce6228cb // 3
+ .word 0xe7311465 // 2
+ .word 0xf3988a32 // 1
+ .word 0x79cc4519 // 0
+ .word 0xe6228cbc // 7
+ .word 0x7311465e // 6
+ .word 0x3988a32f // 5
+ .word 0x9cc45197 // 4
+ .word 0x6228cbce //11
+ .word 0x311465e7 //10
+ .word 0x988a32f3 // 9
+ .word 0xcc451979 // 8
+ .word 0x228cbce6 //15
+ .word 0x11465e73 //14
+ .word 0x88a32f39 //13
+ .word 0xc451979c //12
+ .word 0xec53d43c //19
+ .word 0x7629ea1e //18
+ .word 0x3b14f50f //17
+ .word 0x9d8a7a87 //16
+ .word 0xc53d43ce //23
+ .word 0x629ea1e7 //22
+ .word 0xb14f50f3 //21
+ .word 0xd8a7a879 //20
+ .word 0x53d43cec //27
+ .word 0x29ea1e76 //26
+ .word 0x14f50f3b //25
+ .word 0x8a7a879d //24
+ .word 0x3d43cec5 //31
+ .word 0x9ea1e762 //30
+ .word 0x4f50f3b1 //29
+ .word 0xa7a879d8 //28
+ .word 0xd43cec53 //35
+ .word 0xea1e7629 //34
+ .word 0xf50f3b14 //33
+ .word 0x7a879d8a //32
+ .word 0x43cec53d //39
+ .word 0xa1e7629e //38
+ .word 0x50f3b14f //37
+ .word 0xa879d8a7 //36
+ .word 0x3cec53d4 //43
+ .word 0x1e7629ea //42
+ .word 0x0f3b14f5 //41
+ .word 0x879d8a7a //40
+ .word 0xcec53d43 //47
+ .word 0xe7629ea1 //46
+ .word 0xf3b14f50 //45
+ .word 0x79d8a7a8 //44
+ .word 0xec53d43c //51
+ .word 0x7629ea1e //50
+ .word 0x3b14f50f //49
+
+
+ .size sm3_mb_sm_x4, .-sm3_mb_sm_x4
+
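Unlike the x2/x3 kernels, which interleave message expansion with the rounds via quad_round_expand, the x4 kernel above pre-expands the full message schedule for all four lanes into a stack area (data_buf) and streams it back during the rounds, keeping more vector registers free for the four digests. The expansion computed four words at a time by sm3partw1/sm3partw2 is the standard SM3 recurrence, spelled out in C in sm3_ctx_base.c further below:

    W[j] = P1(W[j-16] ^ W[j-9] ^ rol32(W[j-3], 15)) ^ rol32(W[j-13], 7) ^ W[j-6]
    where P1(x) = x ^ rol32(x, 15) ^ rol32(x, 23)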
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx2.c
new file mode 100644
index 000000000..b1c6ee26b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx2.c
@@ -0,0 +1,284 @@
+/**********************************************************************
+ Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sm3_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SM3_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len);
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx);
+
+void sm3_mb_mgr_init_avx2(SM3_MB_JOB_MGR * state);
+SM3_JOB *sm3_mb_mgr_submit_avx2(SM3_MB_JOB_MGR * state, SM3_JOB * job);
+SM3_JOB *sm3_mb_mgr_flush_avx2(SM3_MB_JOB_MGR * state);
+
+void sm3_mb_mgr_init_avx2(SM3_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xF76543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SM3_X8_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
+
+void sm3_ctx_mgr_init_avx2(SM3_HASH_CTX_MGR * mgr)
+{
+ sm3_mb_mgr_init_avx2(&mgr->mgr);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_submit_avx2(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SM3_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SM3_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SM3_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SM3_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sm3_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_flush_avx2(SM3_HASH_CTX_MGR * mgr)
+{
+ SM3_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_flush_avx2(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sm3_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sm3_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SM3_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ unsigned int j;
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ ctx->job.result_digest[j] =
+ byteswap32(ctx->job.result_digest[j]);
+ }
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SM3_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SM3_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SM3_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SM3_WORD_T * digest)
+{
+ static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] =
+ { SM3_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SM3_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SM3_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SM3_BLOCK_SIZE - 1) & (0 - (total_len + SM3_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SM3_PADLENGTHFIELD_SIZE;
+
+#if SM3_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SM3_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+struct slver sm3_ctx_mgr_init_avx2_slver_0000;
+struct slver sm3_ctx_mgr_init_avx2_slver = { 0x2309, 0x00, 0x00 };
+
+struct slver sm3_ctx_mgr_submit_avx2_slver_0000;
+struct slver sm3_ctx_mgr_submit_avx2_slver = { 0x230a, 0x00, 0x00 };
+
+struct slver sm3_ctx_mgr_flush_avx2_slver_0000;
+struct slver sm3_ctx_mgr_flush_avx2_slver = { 0x230b, 0x00, 0x00 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
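The init/submit/flush trio above follows the usual isa-l multi-buffer pattern: each message owns an SM3_HASH_CTX, buffers are submitted against a shared SM3_HASH_CTX_MGR, and completed jobs are drained with flush. A minimal one-shot sketch, assuming the hash_ctx_init() helper and SM3_DIGEST_NWORDS provided by the headers this file already includes:

    #include <string.h>
    #include "sm3_mb.h"

    /* Hash one buffer through the AVX2 manager and copy out the digest (sketch). */
    static void sm3_one_shot_avx2(const void *msg, uint32_t len,
                                  uint32_t out[SM3_DIGEST_NWORDS])
    {
            SM3_HASH_CTX_MGR mgr;
            SM3_HASH_CTX ctx;

            sm3_ctx_mgr_init_avx2(&mgr);
            hash_ctx_init(&ctx);            /* mark the ctx as idle/complete */

            sm3_ctx_mgr_submit_avx2(&mgr, &ctx, msg, len, HASH_ENTIRE);
            while (sm3_ctx_mgr_flush_avx2(&mgr))
                    ;                       /* drain until no jobs are in flight */

            memcpy(out, ctx.job.result_digest, SM3_DIGEST_NWORDS * sizeof(uint32_t));
    }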
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx512.c
new file mode 100644
index 000000000..8169aa170
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx512.c
@@ -0,0 +1,292 @@
+/**********************************************************************
+ Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sm3_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+static inline void hash_init_digest(SM3_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len);
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx);
+
+void sm3_mb_mgr_init_avx512(SM3_MB_JOB_MGR * state);
+SM3_JOB *sm3_mb_mgr_submit_avx512(SM3_MB_JOB_MGR * state, SM3_JOB * job);
+SM3_JOB *sm3_mb_mgr_flush_avx512(SM3_MB_JOB_MGR * state);
+
+void sm3_mb_mgr_init_avx512(SM3_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xfedcba9876543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SM3_MAX_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
+
+void sm3_ctx_mgr_init_avx512(SM3_HASH_CTX_MGR * mgr)
+{
+ sm3_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_submit_avx512(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+	// A non-zero partial_block_buffer_length means the ctx already holds buffered data,
+	// and len < SM3_BLOCK_SIZE means the incoming data is smaller than one block.
+ if ((ctx->partial_block_buffer_length) | (len < SM3_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SM3_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SM3_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SM3_BLOCK_SIZE) {
+
+ ctx->partial_block_buffer_length = 0;
+ ctx->job.buffer = ctx->partial_block_buffer;
+
+ ctx->job.len = 1;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ }
+
+ }
+
+ return sm3_ctx_mgr_resubmit(mgr, ctx);
+}
+
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ unsigned int j;
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ ctx->job.result_digest[j] =
+ byteswap32(ctx->job.result_digest[j]);
+ }
+ return ctx;
+ }
+		// If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+			// Only whole blocks can be hashed; copy_len is the remainder of len modulo SM3_BLOCK_SIZE.
+ uint32_t copy_len = len & (SM3_BLOCK_SIZE - 1);
+
+			// If there is a partial trailing block, stash it in the extra block buffer.
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ // store the extra data
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+			// len is now a multiple of the block size
+			assert((len % SM3_BLOCK_SIZE) == 0);
+			// Set len to the number of whole blocks to be hashed
+			len >>= SM3_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx =
+ (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+			// TODO: confirm whether this should return immediately instead of looping again.
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SM3_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SM3_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SM3_BLOCK_SIZE - 1) & (0 - (total_len + SM3_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SM3_PADLENGTHFIELD_SIZE;
+
+#if SM3_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SM3_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_flush_avx512(SM3_HASH_CTX_MGR * mgr)
+{
+
+ SM3_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_flush_avx512(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sm3_ctx_mgr_resubmit(mgr, ctx);
+
+		// If sm3_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+		// Otherwise, all jobs currently being managed by the SM3_HASH_CTX_MGR still need processing. Loop.
+ }
+
+}
+
+static inline void hash_init_digest(SM3_WORD_T * digest)
+{
+ static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] =
+ { SM3_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+struct slver sm3_ctx_mgr_init_avx512_slver_0000;
+struct slver sm3_ctx_mgr_init_avx512_slver = { 0x2306, 0x00, 0x00 };
+
+struct slver sm3_ctx_mgr_submit_avx512_slver_0000;
+struct slver sm3_ctx_mgr_submit_avx512_slver = { 0x2307, 0x00, 0x00 };
+
+struct slver sm3_ctx_mgr_flush_avx512_slver_0000;
+struct slver sm3_ctx_mgr_flush_avx512_slver = { 0x2308, 0x00, 0x00 };
+
+#endif // HAVE_AS_KNOWS_AVX512
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
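The index arithmetic in hash_pad() above is terse; a worked example, assuming SM3_BLOCK_SIZE = 64 and SM3_PADLENGTHFIELD_SIZE = 8:

    total_len = 100:  i = 100 & 63 = 36, so the 0x80 byte lands at offset 36;
                      i += ((63 & -(100+8+1)) = 19) + 1 + 8, giving i = 64,
                      the 64-bit bit-length is written at bytes 56..63,
                      and the function returns 64 >> 6 = 1 extra block.
    total_len = 120:  i = 56; 63 & -(129) = 63, so i becomes 128 and two
                      extra blocks are returned, as required since
                      56 + 1 + 8 bytes no longer fit in a single block.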
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base.c
new file mode 100644
index 000000000..e8fcfe08a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base.c
@@ -0,0 +1,314 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sm3_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+#define rol32(x, r) (((x)<<(r)) | ((x)>>(32-(r))))
+
+static void sm3_init(SM3_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static uint32_t OPT_FIX sm3_update(SM3_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static void OPT_FIX sm3_final(SM3_HASH_CTX * ctx, uint32_t remain_len);
+static void OPT_FIX sm3_single(const volatile void *data, uint32_t digest[]);
+static inline void hash_init_digest(SM3_WORD_T * digest);
+
+static inline uint32_t P0(uint32_t X)
+{
+ return (X ^ (rol32(X, 9)) ^ (rol32(X, 17)));
+}
+
+static inline uint32_t P1(uint32_t X)
+{
+ return (X ^ (rol32(X, 15)) ^ (rol32(X, 23)));
+}
+
+static inline uint32_t sm3_ff(int j, uint32_t x, uint32_t y, uint32_t z)
+{
+ return j < 16 ? (x ^ y ^ z) : ((x & y) | (x & z) | (y & z));
+}
+
+static inline uint32_t sm3_gg(int j, uint32_t x, uint32_t y, uint32_t z)
+{
+ return j < 16 ? (x ^ y ^ z) : ((x & y) | ((~x) & z));
+}
+
+static inline void sm3_message_schedule(uint32_t bi[], volatile uint32_t W[],
+ volatile uint32_t W_B[])
+{
+ int j;
+ volatile uint32_t tmp;
+
+ for (j = 0; j <= 15; j++) {
+ W[j] = to_be32(bi[j]);
+ }
+
+ for (; j <= 67; j++) {
+ tmp = W[j - 16] ^ W[j - 9] ^ rol32(W[j - 3], 15);
+ W[j] = P1(tmp) ^ (rol32(W[j - 13], 7)) ^ W[j - 6];
+ }
+
+ for (j = 0; j < 64; j++) {
+ W_B[j] = W[j] ^ W[j + 4];
+ }
+
+ tmp = 0;
+}
+
+static inline void sm3_compress_step_func(int j, volatile uint32_t * a_p,
+ volatile uint32_t * b_p, volatile uint32_t * c_p,
+ volatile uint32_t * d_p, volatile uint32_t * e_p,
+ volatile uint32_t * f_p, volatile uint32_t * g_p,
+ volatile uint32_t * h_p, volatile uint32_t W[],
+ volatile uint32_t W_B[])
+{
+ volatile uint32_t SS1, SS2, TT1, TT2;
+ uint32_t T = j < 16 ? 0x79cc4519 : 0x7a879d8a;
+
+ SS1 = rol32(rol32(*a_p, 12) + *e_p + rol32(T, (j % 32)), 7);
+ SS2 = SS1 ^ rol32(*a_p, 12);
+ TT1 = sm3_ff(j, *a_p, *b_p, *c_p) + *d_p + SS2 + W_B[j];
+ TT2 = sm3_gg(j, *e_p, *f_p, *g_p) + *h_p + SS1 + W[j];
+ *d_p = *c_p;
+ *c_p = rol32(*b_p, 9);
+ *b_p = *a_p;
+ *a_p = TT1;
+ *h_p = *g_p;
+ *g_p = rol32(*f_p, 19);
+ *f_p = *e_p;
+ *e_p = P0(TT2);
+
+ SS1 = 0;
+ SS2 = 0;
+ TT1 = 0;
+ TT2 = 0;
+}
+
+void sm3_ctx_mgr_init_base(SM3_HASH_CTX_MGR * mgr)
+{
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_submit_base(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ uint32_t remain_len;
+
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_PROCESSING) && (flags == HASH_ENTIRE)) {
+ // Cannot submit a new entire job to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags == HASH_FIRST) {
+ if (len % SM3_BLOCK_SIZE != 0) {
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+ sm3_init(ctx, buffer, len);
+ sm3_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_UPDATE) {
+ if (len % SM3_BLOCK_SIZE != 0) {
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+ sm3_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_LAST) {
+ remain_len = sm3_update(ctx, buffer, len);
+ sm3_final(ctx, remain_len);
+ }
+
+ if (flags == HASH_ENTIRE) {
+ sm3_init(ctx, buffer, len);
+ remain_len = sm3_update(ctx, buffer, len);
+ sm3_final(ctx, remain_len);
+ }
+
+ return ctx;
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_flush_base(SM3_HASH_CTX_MGR * mgr)
+{
+ return NULL;
+}
+
+static void sm3_init(SM3_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Mark it as processing
+ ctx->status = HASH_CTX_STS_PROCESSING;
+}
+
+static uint32_t sm3_update(SM3_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ uint32_t remain_len = len;
+ uint32_t *digest = ctx->job.result_digest;
+
+ while (remain_len >= SM3_BLOCK_SIZE) {
+ sm3_single(buffer, digest);
+ buffer = (void *)((uint8_t *) buffer + SM3_BLOCK_SIZE);
+ remain_len -= SM3_BLOCK_SIZE;
+ ctx->total_length += SM3_BLOCK_SIZE;
+ }
+
+ ctx->incoming_buffer = buffer;
+ return remain_len;
+}
+
+static void sm3_final(SM3_HASH_CTX * ctx, uint32_t remain_len)
+{
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t i = remain_len;
+ uint32_t j;
+ volatile uint8_t buf[2 * SM3_BLOCK_SIZE] = { 0 };
+ uint32_t *digest = ctx->job.result_digest;
+
+ ctx->total_length += i;
+ memcpy((void *)buf, buffer, i);
+ buf[i++] = 0x80;
+
+ i = (i > SM3_BLOCK_SIZE - SM3_PADLENGTHFIELD_SIZE ?
+ 2 * SM3_BLOCK_SIZE : SM3_BLOCK_SIZE);
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) ctx->total_length * 8);
+
+ sm3_single(buf, digest);
+ if (i == 2 * SM3_BLOCK_SIZE) {
+ sm3_single(buf + SM3_BLOCK_SIZE, digest);
+ }
+
+	/* byte-swap the digest words into the output byte order */
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ digest[j] = byteswap32(digest[j]);
+ }
+
+ ctx->status = HASH_CTX_STS_COMPLETE;
+ memset((void *)buf, 0, sizeof(buf));
+}
+
+static void sm3_single(const volatile void *data, uint32_t digest[])
+{
+ volatile uint32_t a, b, c, d, e, f, g, h;
+ volatile uint32_t W[68], W_bar[64];
+ int j;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+ f = digest[5];
+ g = digest[6];
+ h = digest[7];
+
+ sm3_message_schedule((uint32_t *) data, W, W_bar);
+ for (j = 0; j < 64; j++) {
+ sm3_compress_step_func(j, &a, &b, &c, &d, &e, &f, &g, &h, W, W_bar);
+ }
+
+ digest[0] ^= a;
+ digest[1] ^= b;
+ digest[2] ^= c;
+ digest[3] ^= d;
+ digest[4] ^= e;
+ digest[5] ^= f;
+ digest[6] ^= g;
+ digest[7] ^= h;
+
+ memset((void *)W, 0, sizeof(W));
+ memset((void *)W_bar, 0, sizeof(W_bar));
+
+ a = 0;
+ b = 0;
+ c = 0;
+ d = 0;
+ e = 0;
+ f = 0;
+ g = 0;
+ h = 0;
+}
+
+static inline void hash_init_digest(SM3_WORD_T * digest)
+{
+ static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] =
+ { SM3_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sm3_ctx_mgr_init_base_slver_0000;
+struct slver sm3_ctx_mgr_init_base_slver = { 0x2303, 0x00, 0x00 };
+
+struct slver sm3_ctx_mgr_submit_base_slver_0000;
+struct slver sm3_ctx_mgr_submit_base_slver = { 0x2304, 0x00, 0x00 };
+
+struct slver sm3_ctx_mgr_flush_base_slver_0000;
+struct slver sm3_ctx_mgr_flush_base_slver = { 0x2305, 0x00, 0x00 };
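Note that the base submit path only accepts HASH_FIRST and HASH_UPDATE chunks whose length is a whole multiple of SM3_BLOCK_SIZE; only the HASH_LAST (or HASH_ENTIRE) call may carry a ragged tail, which sm3_final() pads. A minimal streaming sketch under that constraint, using only names declared in this file:

    #include "sm3_mb.h"

    /* Hash a message fed as two block-aligned chunks plus a tail (sketch). */
    static void sm3_base_stream(SM3_HASH_CTX_MGR *mgr, SM3_HASH_CTX *ctx,
                                const uint8_t *p1, uint32_t len1,   /* multiple of 64 */
                                const uint8_t *p2, uint32_t len2,   /* multiple of 64 */
                                const uint8_t *tail, uint32_t tlen) /* any length */
    {
            sm3_ctx_mgr_init_base(mgr);
            sm3_ctx_mgr_submit_base(mgr, ctx, p1, len1, HASH_FIRST);
            sm3_ctx_mgr_submit_base(mgr, ctx, p2, len2, HASH_UPDATE);
            sm3_ctx_mgr_submit_base(mgr, ctx, tail, tlen, HASH_LAST);
            /* the digest now sits in ctx->job.result_digest */
    }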
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base_aliases.c
new file mode 100644
index 000000000..d74a4c882
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base_aliases.c
@@ -0,0 +1,54 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdint.h>
+#include <string.h>
+#include "sm3_mb.h"
+#include "memcpy_inline.h"
+
+extern void sm3_ctx_mgr_init_base(SM3_HASH_CTX_MGR * mgr);
+extern SM3_HASH_CTX *sm3_ctx_mgr_submit_base(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags);
+extern SM3_HASH_CTX *sm3_ctx_mgr_flush_base(SM3_HASH_CTX_MGR * mgr);
+
+void sm3_ctx_mgr_init(SM3_HASH_CTX_MGR * mgr)
+{
+ return sm3_ctx_mgr_init_base(mgr);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_submit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ return sm3_ctx_mgr_submit_base(mgr, ctx, buffer, len, flags);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_flush(SM3_HASH_CTX_MGR * mgr)
+{
+ return sm3_ctx_mgr_flush_base(mgr);
+}
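
Editor's note: these aliases expose the public sm3_ctx_mgr_init/submit/flush entry points that the tests added below exercise. As a condensed usage sketch of that API (error handling trimmed; hash_one is a hypothetical helper name, and with a single buffer the multi-buffer scheduling adds nothing beyond showing the call sequence):

    #include <stdlib.h>
    #include "sm3_mb.h"

    int hash_one(const void *buf, uint32_t len, uint32_t out[SM3_DIGEST_NWORDS])
    {
            SM3_HASH_CTX_MGR *mgr = NULL;
            SM3_HASH_CTX ctx;

            /* The manager is allocated 16-byte aligned, as in the tests below. */
            if (posix_memalign((void **)&mgr, 16, sizeof(*mgr)))
                    return 1;
            sm3_ctx_mgr_init(mgr);

            hash_ctx_init(&ctx);
            sm3_ctx_mgr_submit(mgr, &ctx, buf, len, HASH_ENTIRE);
            while (sm3_ctx_mgr_flush(mgr))    /* drain any remaining lanes */
                    ;

            for (int i = 0; i < SM3_DIGEST_NWORDS; i++)
                    out[i] = ctx.job.result_digest[i];
            free(mgr);
            return 0;
    }
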
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_job.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_job.asm
new file mode 100644
index 000000000..0f2a0f39a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_job.asm
@@ -0,0 +1,65 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define STS_UNKNOWN 0
+%define STS_BEING_PROCESSED 1
+%define STS_COMPLETED 2
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Threshold constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; if the number of lanes in use is <= threshold, use the single-buffer (sb) function
+%define SM3_SB_THRESHOLD_SSE 1
+%define SM3_SB_THRESHOLD_AVX 1
+%define SM3_SB_THRESHOLD_AVX2 1
+%define SM3_SB_THRESHOLD_AVX512 1
+%define SM3_NI_SB_THRESHOLD_SSE 4 ; shani is faster than sse sha256_mb
+%define SM3_NI_SB_THRESHOLD_AVX512 6
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SM3_JOB structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS	; SM3_JOB
+
+;;; name size align
+FIELD _buffer, 8, 8 ; pointer to buffer
+FIELD _len, 8, 8 ; length in bytes
+FIELD _result_digest, 8*4, 64 ; Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+
+%assign _SM3_JOB_size _FIELD_OFFSET
+%assign _SM3_JOB_align _STRUCT_ALIGN
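
Editor's note: the FIELD macros above describe the job layout as seen from assembly. Roughly, the C side sees the same fields; the sketch below is only an approximation for orientation (the authoritative definition is SM3_JOB in sm3_mb.h), with sizes and alignment taken from the FIELD lines above.

    #include <stdint.h>

    /* Approximate C mirror of the layout defined above (see sm3_mb.h). */
    typedef struct {
            uint8_t  *buffer;                    /* _buffer: pointer to data         */
            uint64_t  len;                       /* _len: length to hash             */
            uint32_t  result_digest[8]           /* _result_digest: 8 x 32-bit words */
                      __attribute__((aligned(64)));
            uint32_t  status;                    /* _status: STS_* value             */
            void     *user_data;                 /* _user_data                       */
    } SM3_JOB_sketch;
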
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_flush_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_flush_test.c
new file mode 100644
index 000000000..fbbb2a1a7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_flush_test.c
@@ -0,0 +1,145 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#define ISAL_UNIT_TEST
+#include <stdio.h>
+#include <stdlib.h>
+#include "sm3_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS (SM3_MAX_LANES - 1)
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint8_t digest_ref[TEST_BUFS][4 * SM3_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+uint8_t lens_print_and_check(SM3_HASH_CTX_MGR * mgr)
+{
+ static int32_t last_lens[SM3_MAX_LANES] = { 0 };
+ int32_t len;
+ uint8_t num_unchanged = 0;
+ int i;
+ for (i = 0; i < SM3_MAX_LANES; i++) {
+ len = (int32_t) mgr->mgr.lens[i];
+		// len[i] in mgr packs (block_count << 4) | lane_index
+		len = (len >= 16) ? (len >> 4 << 6) : 0;	// blocks to bytes (64-byte SM3 blocks)
+ printf("\t%d", len);
+ if (last_lens[i] > 0 && last_lens[i] == len)
+ num_unchanged += 1;
+ last_lens[i] = len;
+ }
+ printf("\n");
+ return num_unchanged;
+}
+
+int main(void)
+{
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ uint8_t num_ret, num_unchanged = 0;
+ int ret;
+
+ printf("sm3_mb flush test, %d buffers with %d length: \n", TEST_BUFS, TEST_LEN);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sm3_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ lens[i] = TEST_LEN / SM3_MAX_LANES * (i + 1);
+ bufs[i] = (unsigned char *)malloc(lens[i]);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], lens[i]);
+ }
+
+ for (i = 0; i < TEST_BUFS; i++) {
+		// Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sm3_ossl(bufs[i], lens[i], digest_ref[i]);
+
+ // Run sb_sm3 test
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ printf("Changes of lens inside mgr:\n");
+ lens_print_and_check(mgr);
+ while (sm3_ctx_mgr_flush(mgr)) {
+ num_ret = lens_print_and_check(mgr);
+ num_unchanged = num_unchanged > num_ret ? num_unchanged : num_ret;
+ }
+ printf("Info of sm3_mb lens prints over\n");
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ref[i])[j])) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%08X <=> 0x%08X \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ref[i])[j]));
+ }
+ }
+ }
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf("Pass\n");
+
+ return fail;
+}
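
Editor's note: lens_print_and_check() above decodes mgr->mgr.lens[]: each entry packs the remaining work (in SM3 blocks) in the upper bits and the lane index in the low nibble, which is why the test shifts right by 4 and multiplies by the 64-byte block size. A small encode/decode sketch, with the field layout inferred from the shifts used here and in the submit/flush assembly:

    #include <stdint.h>

    #define SM3_BLOCK_SIZE_SKETCH 64   /* SM3_BLOCK_SIZE comes from sm3_mb.h */

    static inline uint32_t lens_pack(uint32_t blocks, uint32_t lane)
    {
            return (blocks << 4) | (lane & 0xF);
    }

    static inline uint32_t lens_remaining_bytes(uint32_t len_word)
    {
            return (len_word >> 4) * SM3_BLOCK_SIZE_SKETCH;
    }
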
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_datastruct.asm
new file mode 100644
index 000000000..a2319ba14
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_datastruct.asm
@@ -0,0 +1,77 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SM3 Out Of Order Data Structures
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; LANE_DATA
+;;; name size align
+FIELD _job_in_lane, 8, 8 ; pointer to job object
+END_FIELDS
+
+%assign _LANE_DATA_size _FIELD_OFFSET
+%assign _LANE_DATA_align _STRUCT_ALIGN
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SM3_ARGS_X16
+;;; name size align
+FIELD _digest, 4*8*16, 4 ; transposed digest
+FIELD _data_ptr, 8*16, 8 ; array of pointers to data
+END_FIELDS
+
+%assign _SM3_ARGS_X4_size _FIELD_OFFSET
+%assign _SM3_ARGS_X4_align _STRUCT_ALIGN
+%assign _SM3_ARGS_X8_size _FIELD_OFFSET
+%assign _SM3_ARGS_X8_align _STRUCT_ALIGN
+%assign _SM3_ARGS_X16_size _FIELD_OFFSET
+%assign _SM3_ARGS_X16_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+START_FIELDS ; MB_MGR
+;;; name size align
+FIELD _args, _SM3_ARGS_X4_size, _SM3_ARGS_X4_align
+FIELD _lens, 4*16, 8
+FIELD _unused_lanes, 8, 8
+FIELD _ldata, _LANE_DATA_size*16, _LANE_DATA_align
+FIELD _num_lanes_inuse, 4, 4
+END_FIELDS
+
+%assign _MB_MGR_size _FIELD_OFFSET
+%assign _MB_MGR_align _STRUCT_ALIGN
+
+_args_digest equ _args + _digest
+_args_data_ptr equ _args + _data_ptr
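
Editor's note: for orientation, the manager layout spelled out above corresponds roughly to the C view below (a sketch only; the authoritative struct is SM3_MB_JOB_MGR in sm3_mb.h). The digest area is transposed: 8 words by 16 lanes, stored word-major, so lane i's word w lives at digest[w][i] -- matching the 4*idx + w*4*16 addressing used in the flush/submit code.

    #include <stdint.h>

    /* Approximate C mirror of the MB_MGR fields defined above. */
    typedef struct {
            uint32_t digest[8][16];                   /* _args._digest: transposed digest */
            uint8_t *data_ptr[16];                    /* _args._data_ptr: one per lane    */
            uint32_t lens[16];                        /* _lens: (blocks << 4) | lane      */
            uint64_t unused_lanes;                    /* _unused_lanes: packed 4-bit list */
            struct { void *job_in_lane; } ldata[16];  /* _ldata                           */
            uint32_t num_lanes_inuse;                 /* _num_lanes_inuse                 */
    } SM3_MB_JOB_MGR_sketch;
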
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx2.asm
new file mode 100644
index 000000000..b87bdcba8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx2.asm
@@ -0,0 +1,258 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sm3_job.asm"
+%include "sm3_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sm3_mb_x8_avx2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define tmp4 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define tmp4 rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by sm3_mb_x8_avx2
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SM3_JOB* sm3_mb_mgr_flush_avx2(SM3_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sm3_mb_mgr_flush_avx2, function
+sm3_mb_mgr_flush_avx2:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to check whether all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+ cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [four]
+ cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [five]
+ cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [six]
+ cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [seven]
+
+	; copy the data pointer of lane "idx" into the empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+mb_processing:
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sm3_mb_x8_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*8]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*8], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*8], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*8], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*8]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*8], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*8], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*8], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
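
Editor's note: the vpminud/vpalignr ladder above reduces the eight packed lens words to their minimum, so the manager knows how many whole blocks every occupied lane can advance by and which lane finishes first. A scalar C equivalent of that search (a sketch; the real code keeps everything in xmm registers):

    #include <stdint.h>

    /* Scalar equivalent of the SIMD min-length search above. */
    static void find_min_len(const uint32_t lens[8], uint32_t *idx, uint32_t *blocks)
    {
            uint32_t min = lens[0];
            for (int i = 1; i < 8; i++)
                    if (lens[i] < min)
                            min = lens[i];
            *idx    = min & 0xF;   /* low nibble: lane index of the shortest job */
            *blocks = min >> 4;    /* upper bits: number of whole blocks to hash */
    }
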
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx512.asm
new file mode 100644
index 000000000..7feada49f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx512.asm
@@ -0,0 +1,276 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sm3_job.asm"
+%include "sm3_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sm3_mb_x16_avx512
+;extern sm3_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg1 rdi ; rcx
+ %define arg2 rsi ; rdx
+ %define tmp4 rdx
+%else
+ %define arg1 rcx
+ %define arg2 rdx
+ %define tmp4 rsi
+%endif
+
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define idx rbp
+
+%define num_lanes_inuse r9
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+
+; SM3_JOB* sm3_mb_mgr_flush_avx512(SM3_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sm3_mb_mgr_flush_avx512, function
+sm3_mb_mgr_flush_avx512:
+ endbranch
+
+ ; Save the stack
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+
+	; copy the data pointer of lane "idx" into the empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+	; flush could check the remaining lane count here and fall back to a single-buffer (x1) routine
+
+mb_processing:
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sm3_mb_x16_avx512
+ ; state and idx are intact
+
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+
+; restore registers and return
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sm3_mb_mgr_flush_avx512
+no_sm3_mb_mgr_flush_avx512:
+%endif
+
+%endif ; HAVE_AS_KNOWS_AVX512
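
Editor's note: before the min-length search, flush points every empty lane at the data of a lane that still holds a job and sets its length word to 0xFFFFFFFF, so empty lanes never win the minimum and the x16 kernel always has 16 valid pointers. In C terms (a sketch of the copy_lane_data loop above; field names follow the illustrative MB_MGR mirror given earlier in these notes):

    #include <stddef.h>

    /* Sketch of copy_lane_data: park empty lanes on a lane that still has a job. */
    static void park_empty_lanes(SM3_MB_JOB_MGR_sketch *state, int idx)
    {
            for (int i = 0; i < 16; i++) {
                    if (state->ldata[i].job_in_lane == NULL) {
                            state->data_ptr[i] = state->data_ptr[idx]; /* idx: a non-empty lane */
                            state->lens[i] = 0xFFFFFFFF;               /* never the minimum     */
                    }
            }
    }
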
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx2.asm
new file mode 100644
index 000000000..ae95faa89
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx2.asm
@@ -0,0 +1,247 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sm3_job.asm"
+%include "memcpy.asm"
+%include "sm3_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sm3_mb_x8_avx2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define idx r8
+%define last_len r8
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+
+%define tmp r9
+
+%define lane_data r10
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; SM3_JOB* sm3_mb_mgr_submit_avx2(SM3_MB_JOB_MGR *state, SM3_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sm3_mb_mgr_submit_avx2, function
+sm3_mb_mgr_submit_avx2:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+ mov [rsp + 8*6], r14
+ mov [rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ mov [lane_data + _job_in_lane], job
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ vmovdqu xmm1, [job + _result_digest + 1*16]
+ vmovd [state + _args_digest + 4*lane + 0*4*8], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*4*8], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*4*8], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*4*8], xmm0, 3
+ vmovd [state + _args_digest + 4*lane + 4*4*8], xmm1
+ vpextrd [state + _args_digest + 4*lane + 5*4*8], xmm1, 1
+ vpextrd [state + _args_digest + 4*lane + 6*4*8], xmm1, 2
+ vpextrd [state + _args_digest + 4*lane + 7*4*8], xmm1, 3
+
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xf
+ jne return_null
+
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sm3_mb_x8_avx2
+
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*8]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*8], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*8], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*8], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*8]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*8], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*8], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*8], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ mov r14, [rsp + 8*6]
+ mov r15, [rsp + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+
+
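
Editor's note: submit pops the next free lane from the packed unused_lanes list (4 bits per lane index), records the job, and only calls the x8 kernel once every lane is occupied; partially filled batches are left for flush. A sketch of the lane-allocation step, mirroring the and/shr and shl/or sequence above (job_len_blocks stands in for the job's _len field, and the struct type is the illustrative mirror introduced earlier):

    #include <stdint.h>

    /* Sketch of the lane-allocation step in sm3_mb_mgr_submit_avx2. */
    static uint32_t alloc_lane(SM3_MB_JOB_MGR_sketch *state, void *job, uint32_t job_len_blocks)
    {
            uint32_t lane = state->unused_lanes & 0xF;         /* low nibble = next free lane */
            state->unused_lanes >>= 4;                         /* pop it from the packed list */
            state->lens[lane] = (job_len_blocks << 4) | lane;  /* same shl/or as above        */
            state->ldata[lane].job_in_lane = job;
            state->num_lanes_inuse++;
            return lane;
    }
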
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx512.asm
new file mode 100644
index 000000000..7b7b21287
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx512.asm
@@ -0,0 +1,273 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sm3_job.asm"
+%include "memcpy.asm"
+%include "sm3_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+;
+; SM3_JOB* sm3_mb_mgr_submit_avx512 (SM3_MB_JOB_MGR *state, SM3_JOB* job);
+;
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+;todo sm3_mb_x16_avx512
+extern sm3_mb_x16_avx512
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define arg1 rdi ; state
+%define arg2 rsi ; job
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx ; state
+%define arg2 rdx ; job
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2 ;
+%define len2 arg2 ; + offset
+%define p2 arg2 ; need + offset
+
+%define idx r8
+%define last_len r8
+%define p r11
+%define start_offset r11
+%define num_lanes_inuse r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+
+%define tmp r9
+
+%define lane_data r10
+
+; TODO: verify the stack layout below
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+mk_global sm3_mb_mgr_submit_avx512, function
+sm3_mb_mgr_submit_avx512:
+ endbranch
+
+ ; save these registers
+ sub rsp, STACK_SPACE
+	; rsp holds the stack pointer; save GPRs at the bottom of the reserved area
+ mov [rsp + 8*0], rbx
+	mov	[rsp + 8*3], rbp	; slots 1 and 2 are only used for rsi/rdi on win64
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+ mov [rsp + 8*6], r14
+ mov [rsp + 8*7], r15
+	; rbx, rbp and r12-r15 are now saved on the stack
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+	; unused_lanes (rbx) and lane (rbp) have both been saved above
+	; the low nibble of unused_lanes holds the index of the next free lane
+	and	lane, 0xF
+
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ mov [lane_data + _job_in_lane], job
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ vmovdqu xmm1, [job + _result_digest + 1*16]
+ vmovd [state + _args_digest + 4*lane + 0*4*16], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*4*16], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*4*16], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*4*16], xmm0, 3
+ vmovd [state + _args_digest + 4*lane + 4*4*16], xmm1
+ vpextrd [state + _args_digest + 4*lane + 5*4*16], xmm1, 1
+ vpextrd [state + _args_digest + 4*lane + 6*4*16], xmm1, 2
+ vpextrd [state + _args_digest + 4*lane + 7*4*16], xmm1, 3
+
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ add num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+	; only start hashing once all 16 lanes are in use
+ cmp num_lanes_inuse, 16
+ jne return_null
+
+start_loop:
+	; Find min length; ymm0 holds the first 8 lens, ymm1 the last 8
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sm3_mb_x16_avx512
+
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+; restore stack
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ mov r14, [rsp + 8*6]
+ mov r15, [rsp + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=32
+
+align 32
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+
+
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sm3_mb_mgr_submit_avx512
+no_sm3_mb_mgr_submit_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
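
Editor's note: when a job completes, its lane index is pushed back onto unused_lanes (shift left by 4, OR in the index) and num_lanes_inuse is decremented -- the mirror image of the pop done at submit time. A sketch of that release path (the len_is_0 section above), again using the illustrative struct names from earlier notes:

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of lane release after a job completes. */
    static void release_lane(SM3_MB_JOB_MGR_sketch *state, SM3_JOB_sketch *job, uint32_t idx)
    {
            state->ldata[idx].job_in_lane = NULL;
            state->unused_lanes = (state->unused_lanes << 4) | idx;  /* push lane back */
            state->num_lanes_inuse--;
            job->status = 2;  /* STS_COMPLETED, per sm3_job.asm */
    }
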
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_ssl_test.c
new file mode 100644
index 000000000..b904ba0ca
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_ssl_test.c
@@ -0,0 +1,160 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#define ISAL_UNIT_TEST
+#include <stdio.h>
+#include <stdlib.h>
+#include "sm3_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 200
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SM3_DIGEST_NWORDS];
+
+extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest);
+
+// Generates pseudo-random data
+static void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ int ret;
+
+ printf("multibinary_sm3 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sm3_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // SSL test
+ sm3_ossl(bufs[i], TEST_LEN, digest_ssl[i]);
+
+ // sb_sm3 test
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sm3_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sm3_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Random buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run SSL test
+ sm3_ossl(bufs[i], lens[i], digest_ssl[i]);
+
+ // Run sb_sm3 test
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sm3_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sm3_ssl rand: Pass\n");
+
+ return fail;
+}
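
Editor's note: sm3_ossl() is the OpenSSL-backed reference these tests link against; its definition lives elsewhere in the tree. Assuming an OpenSSL build with SM3 enabled (1.1.1 or later), such a wrapper could look like the following -- this is a guess at the shape, not the file actually used here:

    #include <openssl/evp.h>

    void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest)
    {
            /* EVP_sm3() requires OpenSSL built with SM3 support. */
            EVP_Digest(buf, length, digest, NULL, EVP_sm3(), NULL);
    }
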
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_test.c
new file mode 100644
index 000000000..3671a3b79
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_test.c
@@ -0,0 +1,206 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#define ISAL_UNIT_TEST
+#include <stdio.h>
+#include <stdlib.h>
+#include "sm3_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint8_t digest_ref[TEST_BUFS][4 * SM3_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest);
+
+// Generates pseudo-random data
+static void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ uint8_t *tmp_buf;
+ int ret;
+
+ printf("multibinary_sm3 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sm3_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sm3_ossl(bufs[i], TEST_LEN, digest_ref[i]);
+
+ // Run sb_sm3 test
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sm3_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ref[i])[j])) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%08X <=> 0x%08X \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ref[i])[j]));
+ }
+ }
+ }
+
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sm3_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Use buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run reference test
+ sm3_ossl(bufs[i], lens[i], digest_ref[i]);
+
+ // Run sm3_mb test
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sm3_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ref[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail "
+ "0x%08X <=> 0x%08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ref[i])[j]));
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ // Test at the end of buffer
+ jobs = rand() % TEST_BUFS;
+ tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs);
+ if (!tmp_buf) {
+ printf("malloc failed, end test aborted.\n");
+ return 1;
+ }
+
+ rand_buffer(tmp_buf, jobs);
+
+ sm3_ctx_mgr_init(mgr);
+
+ // Extend to the end of allocated buffer to construct jobs
+ for (i = 0; i < jobs; i++) {
+ bufs[i] = (uint8_t *) & tmp_buf[i];
+ lens[i] = jobs - i;
+
+ // Reference test
+ sm3_ossl(bufs[i], lens[i], digest_ref[i]);
+
+ // sb_sm3 test
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sm3_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ref[i])[j])) {
+ fail++;
+ printf("End test failed at offset %d - result: 0x%08X"
+ ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ref[i])[j]));
+ }
+ }
+ }
+
+ putchar('.');
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sm3 rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_update_test.c
new file mode 100644
index 000000000..64e583ffc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_update_test.c
@@ -0,0 +1,298 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#define ISAL_UNIT_TEST
+#include <stdio.h>
+#include <stdlib.h>
+#include "sm3_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define UPDATE_SIZE (13*SM3_BLOCK_SIZE)
+#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*SM3_BLOCK_SIZE))
+
+#ifdef DEBUG
+# define debug_char(x) putchar(x)
+#else
+# define debug_char(x) do {} while (0)
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ref[TEST_BUFS][4 * SM3_DIGEST_NWORDS];
+extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest);
+
+// Generates pseudo-random data
+static void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, fail = 0;
+ int len_done, len_rem, len_rand;
+ unsigned char *bufs[TEST_BUFS];
+ unsigned char *buf_ptr[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int joblen, jobs, t;
+ int ret;
+
+ printf("multibinary_sm3_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sm3_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ buf_ptr[i] = bufs[i];
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sm3_ossl(bufs[i], TEST_LEN, digest_ref[i]);
+ }
+
+ // Run sb_sm3 tests
+ for (i = 0; i < TEST_BUFS;) {
+ len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_done == 0)
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_FIRST);
+ else if (len_rem <= UPDATE_SIZE)
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+		// Move on to the next job if this one was queued (NULL) or came back finished
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sm3_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sm3_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+
+ len_done = (int)((unsigned long)buf_ptr[i]
+ - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_rem <= UPDATE_SIZE)
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sm3_ctx_mgr_flush(mgr);
+ }
+
+ // Check digests
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ref[i])[j])) {
+ fail++;
+				printf("Test%d fixed size, digest%d fail %8X <=> %8X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ref[i])[j]));
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ for (i = 0; i < jobs; i++) {
+ joblen = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], joblen);
+ lens[i] = joblen;
+ buf_ptr[i] = bufs[i];
+ sm3_ossl(bufs[i], lens[i], digest_ref[i]);
+ }
+
+ sm3_ctx_mgr_init(mgr);
+
+ // Run sm3_sb jobs
+ i = 0;
+ while (i < jobs) {
+ // Submit a new job
+ len_rand = SM3_BLOCK_SIZE +
+ SM3_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS);
+
+ if (lens[i] > len_rand)
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_FIRST);
+ else
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], lens[i], HASH_ENTIRE);
+
+			// The returned ctx can be:
+			//  - NULL (we are just getting started and the lanes are not full yet), or
+			//  - already finished (the ENTIRE job just submitted, or an earlier LAST, is returned), or
+			//  - an unfinished ctx, which we resubmit below
+
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ } else {
+ // unfinished ctx returned, choose another random update length and submit either
+ // UPDATE or LAST depending on the amount of buffer remaining
+ while ((ctx != NULL) && !(hash_ctx_complete(ctx))) {
+ j = (unsigned long)(ctx->user_data); // Get index of the returned ctx
+ buf_ptr[j] = bufs[j] + ctx->total_length;
+ len_rand = (rand() % SM3_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ len_rem = lens[j] - ctx->total_length;
+
+ if (len_rem <= len_rand) // submit the rest of the job as LAST
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rem, HASH_LAST);
+ else // submit the random update length as UPDATE
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rand,
+ HASH_UPDATE);
+ } // Either continue submitting any contexts returned here as UPDATE/LAST, or
+ // go back to submitting new jobs using the index i.
+
+ i++;
+ }
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sm3_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sm3_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer
+ len_rem = lens[i] - ctx->total_length;
+ len_rand = (rand() % SM3_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ debug_char('+');
+ if (len_rem <= len_rand)
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sm3_ctx_mgr_flush(mgr);
+ }
+
+ // Check result digest
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ref[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %8X <=> %8X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ref[i])[j]));
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sm3_update rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_test.c
new file mode 100644
index 000000000..c409530c7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_test.c
@@ -0,0 +1,250 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sm3_mb.h"
+
+typedef struct {
+ const char *msg;
+ uint32_t resultDigest[SM3_DIGEST_NWORDS];
+} TestData;
+
+static TestData test_data[] = {
+ {
+ .msg = "abc",
+ .resultDigest = {0xf4f0c766, 0xd9edee62, 0x6bd4f2d1, 0xe2e410dc,
+ 0x87c46741, 0xa2f7f25c, 0x2ba07d29, 0xe0a84b8f}
+ },
+ {
+ .msg = "abcdabcdabcdabcdabcdabcdabcdabcd" "abcdabcdabcdabcdabcdabcdabcdabcd",
+ .resultDigest = {0xf99fbede, 0xa1b87522, 0x89486038, 0x4d5a8ec1,
+ 0xe570db6f, 0x65577e38, 0xa3cb3d29, 0x32570c9c}
+
+ },
+ {
+ .msg = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq",
+ .resultDigest = {0xc56c9b63, 0x379e4de6, 0x92b190a3, 0xeaa14fdf,
+ 0x74ab2007, 0xb992f67f, 0x664e8cf3, 0x058c7bad}
+ },
+
+ {.msg = "0123456789:;<=>?@ABCDEFGHIJKLMNO",
+ .resultDigest = {0x076833d0, 0xd089ec39, 0xad857685, 0x8089797a,
+ 0x9df9e8fd, 0x4126eb9a, 0xf38c22e8, 0x054bb846}},
+ {
+ .msg =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<",
+ .resultDigest = {0x6cb9d38e, 0x846ac99e, 0x6d05634b, 0x3fe1bb26,
+ 0x90368c4b, 0xee8c4299, 0x08c0e96a, 0x2233cdc7}
+ },
+ {
+ .msg =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR",
+ .resultDigest = {0x83758189, 0x050f14d1, 0x91d8a730, 0x4a2825e4,
+ 0x11723273, 0x2114ee3f, 0x18cac172, 0xa9c5b07a}
+ },
+ {
+ .msg =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?",
+ .resultDigest = {0xb80f8aba, 0x55e96119, 0x851ac77b, 0xae31b3a5,
+ 0x1333e764, 0xc86ac40d, 0x34878db1, 0x7da873f6},
+ },
+ {
+ .msg =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU",
+ .resultDigest = {0xbd5736a7, 0x55977d13, 0xa950c78a, 0x71eeb7cb,
+ 0xe9ef0ba5, 0x95a9302e, 0x155e5c33, 0xad96ce3c}
+ },
+ {
+ .msg = "",
+ .resultDigest = {0x831db21a, 0x7fa1cf55, 0x4819618e, 0x8f1ae831,
+ 0xc7c8be22, 0x74fbfe28, 0xeb35d07e, 0x2baa8250}
+
+ },
+
+};
+
+#define MSGS sizeof(test_data)/sizeof(TestData)
+#define NUM_JOBS 1000
+
+#define PSEUDO_RANDOM_NUM(seed) (((seed) * 5 + ((seed) * (seed)) / 64) % MSGS)
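+// For illustration (not part of the test logic): with the 9 vectors above,
+// MSGS == 9, so PSEUDO_RANDOM_NUM(0) == 0, PSEUDO_RANDOM_NUM(1) == 5 and
+// PSEUDO_RANDOM_NUM(2) == 1 -- a cheap deterministic scramble of the job
+// index onto a test-vector index (integer division means the (seed*seed)/64
+// term only contributes once seed >= 8).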
+
+int main(void)
+{
+
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+ uint32_t i, j, k, t, checked = 0;
+ uint32_t *good;
+ int ret;
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ sm3_ctx_mgr_init(mgr);
+ // Init contexts before first use
+ for (i = 0; i < MSGS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ for (i = 0; i < MSGS; i++) {
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i], test_data[i].msg,
+ strlen((char *)test_data[i].msg), HASH_ENTIRE);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = test_data[t].resultDigest;
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+
+ }
+ }
+
+ while (1) {
+ ctx = sm3_ctx_mgr_flush(mgr);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = test_data[t].resultDigest;
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+				printf("Something bad happened during the flush."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ // do larger test in pseudo-random order
+
+ // Init contexts before first use
+ for (i = 0; i < NUM_JOBS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ checked = 0;
+ for (i = 0; i < NUM_JOBS; i++) {
+ j = PSEUDO_RANDOM_NUM(i);
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ test_data[j].msg, strlen((char *)test_data[j].msg),
+ HASH_ENTIRE);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = test_data[k].resultDigest;
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ }
+ }
+ while (1) {
+ ctx = sm3_ctx_mgr_flush(mgr);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = test_data[k].resultDigest;
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+				printf("Something bad happened during the flush."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ if (checked != NUM_JOBS) {
+ printf("only tested %d rather than %d\n", checked, NUM_JOBS);
+ return -1;
+ }
+
+ printf(" multibinary_sm3 test: Pass\n");
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_perf.c
new file mode 100644
index 000000000..ed4d9a092
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_perf.c
@@ -0,0 +1,128 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sm3_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS 32
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 10000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest);
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SM3_DIGEST_NWORDS];
+
+int main(void)
+{
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("calloc failed test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ int ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ sm3_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ sm3_ossl(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sm3_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+
+ while (sm3_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sm3" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+
+ printf("Multi-buffer sm3 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sm3_ossl_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_shortage_perf.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_shortage_perf.c
new file mode 100644
index 000000000..025fd90ed
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_shortage_perf.c
@@ -0,0 +1,133 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sm3_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS SM3_MAX_LANES
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 10000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest);
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SM3_DIGEST_NWORDS];
+
+int main(void)
+{
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ uint32_t nlanes;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("calloc failed test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ int ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ sm3_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ sm3_ossl(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sm3_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb shortage tests
+ for (nlanes = TEST_BUFS; nlanes > 0; nlanes--) {
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < nlanes; i++)
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN,
+ HASH_ENTIRE);
+
+ while (sm3_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sm3" TEST_TYPE_STR " with %d lanes: ", nlanes);
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < nlanes; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ }
+
+ printf("Multi-buffer sm3 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+		printf(" multibinary_sm3_ossl_shortage_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x16_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x16_avx512.asm
new file mode 100644
index 000000000..3b300fa80
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x16_avx512.asm
@@ -0,0 +1,1035 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sm3_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+[bits 64]
+default rel
+section .text
+
+; Define Stack Layout
+START_FIELDS
+;;; name size align
+FIELD _DIGEST_SAVE, 8*64, 64
+FIELD _rsp, 8, 8
+%assign STACK_SPACE _FIELD_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx ; arg0 preserved
+ %define arg2 rdx ; arg1
+ %define reg3 r8 ; arg2 preserved
+ %define reg4 r9 ; arg3
+ %define var1 rdi
+ %define var2 rsi
+ %define local_func_decl(func_name) global func_name
+ %else
+ %define arg1 rdi ; arg0
+ %define arg2 rsi ; arg1
+ %define var1 rdx ; arg2
+ %define var2 rcx ; arg3
+ %define local_func_decl(func_name) mk_global func_name, function, internal
+%endif
+
+%define state arg1
+%define num_blks arg2
+
+%define IN (state + _data_ptr) ; rdi + 8*16
+%define DIGEST state ; rdi
+%define SIZE num_blks ; rsi
+
+%define IDX var1
+%define TBL var2
+
+%define APPEND(a,b) a %+ b
+
+
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define F zmm5
+%define G zmm6
+%define H zmm7
+
+;
+; 4 ZMM for tmp data
+;
+%define TMP0 zmm8
+%define TMP1 zmm9
+%define TMP2 zmm10
+%define TMP3 zmm11
+
+;
+; The word array W[] expands to 64 entries and WB[] to 68 entries:
+; WB[j]:
+;     tmp   = WB[j - 16] ^ WB[j - 9] ^ rol32(WB[j - 3], 15);
+;     WB[j] = P1(tmp) ^ rol32(WB[j - 13], 7) ^ WB[j - 6];
+; W[j]:
+;     W[j] = WB[j] ^ WB[j + 4]
+;
+; Twenty ZMM registers (zmm12~zmm31) hold a sliding window of WB values,
+; because computing W[j] needs WB[j - 16] through WB[j + 4], i.e. 20 entries.
+;
+; The transposed lane data is also kept in zmm12~zmm27 (WB0~WB15); once the
+; WB values have been computed, the raw lane copies are no longer needed.
+;
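+; For reference, a scalar C sketch of the expansion that is vectorized here
+; (illustrative only; rol32() and P1() stand for a 32-bit left-rotate helper
+; and the permutation implemented by the P1 macro further below):
+;
+;     uint32_t WB[68], W[64];
+;     /* WB[0..15] are the 16 message words of the 512-bit block */
+;     for (int j = 16; j < 68; j++) {
+;         uint32_t tmp = WB[j - 16] ^ WB[j - 9] ^ rol32(WB[j - 3], 15);
+;         WB[j] = P1(tmp) ^ rol32(WB[j - 13], 7) ^ WB[j - 6];
+;     }
+;     for (int j = 0; j < 64; j++)
+;         W[j] = WB[j] ^ WB[j + 4];
+;
+; In the vector code each ZMM below holds one WB entry for all 16 lanes, and
+; the 68-entry sequence lives in a 20-register window (indices taken mod 20).
+;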
+%define WB0 zmm12
+%define WB1 zmm13
+%define WB2 zmm14
+%define WB3 zmm15
+%define WB4 zmm16
+%define WB5 zmm17
+%define WB6 zmm18
+%define WB7 zmm19
+
+%define WB8 zmm20
+%define WB9 zmm21
+%define WB10 zmm22
+%define WB11 zmm23
+%define WB12 zmm24
+%define WB13 zmm25
+%define WB14 zmm26
+%define WB15 zmm27
+
+%define WB16 zmm28
+%define WB17 zmm29
+%define WB18 zmm30
+%define WB19 zmm31
+
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 rax
+
+;
+; same as sha256
+;
+%macro TRANSPOSE16 18
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%r8 %9
+%define %%r9 %10
+%define %%r10 %11
+%define %%r11 %12
+%define %%r12 %13
+%define %%r13 %14
+%define %%r14 %15
+%define %%r15 %16
+%define %%t0 %17
+%define %%t1 %18
+
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2}
+
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2}
+
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0}
+
+ ; use r6 in place of t0
+ vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0}
+ vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2}
+ vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0}
+ vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2}
+
+ vshufps %%r11, %%r6, %%t1, 0xDD ; r11 = {l13 k13 j13 113 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1}
+ vshufps %%r9, %%r8, %%r10, 0x88 ; r9 = {l14 k14 j14 114 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2}
+ vshufps %%r8, %%r8, %%r10, 0xDD ; r8 = {l15 k15 j15 115 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3}
+ vshufps %%r6, %%r6, %%t1, 0x88 ; r6 = {l12 k12 j12 112 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0}
+
+ ; use r10 in place of t0
+ vshufps %%r10, %%r12, %%r13, 0x44 ; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 a1 m0}
+ vshufps %%r12, %%r12, %%r13, 0xEE ; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 a3 m2}
+ vshufps %%t1, %%r14, %%r15, 0x44 ; t1 = {p13 p12 013 012 p9 p8 09 08 p5 p4 05 04 p1 p0 01 00}
+ vshufps %%r14, %%r14, %%r15, 0xEE ; r14 = {p15 p14 015 014 p11 p10 011 010 p7 p6 07 06 p3 p2 03 02}
+
+ vshufps %%r15, %%r10, %%t1, 0xDD ; r15 = {p13 013 n13 m13 p9 09 n9 m9 p5 05 n5 m5 p1 01 n1 m1}
+ vshufps %%r13, %%r12, %%r14, 0x88 ; r13 = {p14 014 n14 m14 p10 010 n10 m10 p6 06 n6 m6 p2 02 n2 m2}
+ vshufps %%r12, %%r12, %%r14, 0xDD ; r12 = {p15 015 n15 m15 p11 011 n11 m11 p7 07 n7 m7 p3 03 n3 m3}
+ vshufps %%r10, %%r10, %%t1, 0x88 ; r10 = {p12 012 n12 m12 p8 08 n8 m8 p4 04 n4 m4 p0 00 n0 m0}
+
+ vmovdqa32 %%r14, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0}
+ vmovdqa32 %%t1, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vmovdqa32 %%r2, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1}
+ vmovdqa32 %%t0, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vmovdqa32 %%r3, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r7, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vmovdqa32 %%r1, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3}
+ vmovdqa32 %%r5, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vmovdqa32 %%r0, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0}
+ vmovdqa32 %%r4, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4}
+
+ vmovdqa32 %%r6, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1}
+ vmovdqa32 %%r10, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5}
+
+ vmovdqa32 %%r11, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r11, %%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2}
+ vmovdqa32 %%r15, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6}
+
+ vmovdqa32 %%r9, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3}
+ vmovdqa32 %%r13, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7}
+
+ ;; At this point r8 and r12 can be used as scratch registers
+
+ vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+ vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+
+ vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+ vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+ vshuff64x2 %%t1, %%r7, %%r15, 0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+ vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+ vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+ vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+
+ vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+ vshuff64x2 %%r3, %%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+
+ vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+ vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+
+ vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+%endmacro
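+
+; Net effect of TRANSPOSE16, as an illustrative scalar sketch (in/out viewed
+; as 16x16 matrices of dwords, one row per lane register):
+;
+;     for (int w = 0; w < 16; w++)
+;         for (int lane = 0; lane < 16; lane++)
+;             out[w][lane] = in[lane][w];
+;
+; i.e. after the macro, register k holds message word k of all 16 lane buffers.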
+
+
+%macro ROTATE_ARGS 0
+ %xdefine TMP_ D
+ %xdefine D C
+ %xdefine C B
+ %xdefine B A
+ %xdefine A TMP3
+ %xdefine TMP3 TMP_
+
+ %xdefine TMP2_ H
+ %xdefine H G
+ %xdefine G F
+ %xdefine F E
+ %xdefine E TMP0
+ %xdefine TMP0 TMP2_
+%endmacro
+
+;
+; P() Save in TMP0
+; used TMP1
+%macro P 1
+%define %%A %1
+ vprold TMP0,%%A,9
+ vprold TMP1,%%A,17
+
+ vpternlogd TMP0,TMP1,%%A,0x96
+
+%endmacro
+
+;
+; P1() Save in TMP0
+; used TMP1
+%macro P1 1
+%define %%A %1
+
+ vprold TMP0,%%A,15
+ vprold TMP1,%%A,23
+
+ vpternlogd TMP0,TMP1,%%A,0x96
+%endmacro
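+
+; Scalar reference for the two permutations above, sketched in C for clarity
+; (vpternlogd with immediate 0x96 is a three-way XOR; rol32() is assumed to
+; be a 32-bit left-rotate helper):
+;
+;     uint32_t P (uint32_t x) { return x ^ rol32(x, 9)  ^ rol32(x, 17); }  /* SM3's P0 */
+;     uint32_t P1(uint32_t x) { return x ^ rol32(x, 15) ^ rol32(x, 23); }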
+
+;
+; FF_16() Save in TMP0
+;
+%macro FF_16 3
+%define %%X %1
+%define %%Y %2
+%define %%Z %3
+ ; I < 16 return (X ^ Y ^ Z)
+ vmovups TMP0,%%X
+ vpternlogd TMP0,%%Y,%%Z,0x96
+%endmacro
+
+
+;
+; FF_64() Save in TMP0
+; used TMP1
+%macro FF_64 3
+
+%define %%X %1
+%define %%Y %2
+%define %%Z %3
+	; I >= 16: return (x & y) | (x & z) | (y & z)
+ ; Same as (x & y) | (z & (x | y))
+ vporq TMP0,%%X,%%Y
+ vpandq TMP0,%%Z
+ vpandq TMP1,%%X,%%Y
+ vporq TMP0,TMP1
+%endmacro
+
+
+;
+; GG() Save in TMP0
+; used TMP1
+%macro GG_16 3
+%define %%X %1
+%define %%Y %2
+%define %%Z %3
+ ; I < 16 return (x ^ y ^ z)
+ vmovups TMP0,%%X
+ vpternlogd TMP0,%%Y,%%Z,0x96
+%endmacro
+
+%macro GG_64 3
+
+%define %%X %1
+%define %%Y %2
+%define %%Z %3
+
+	; I >= 16: return (x & y) | ((~x) & z)
+ vpandq TMP0,%%X,%%Y
+ vpandnd TMP1,%%X,%%Z
+ vporq TMP0,TMP1
+%endmacro
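+
+; Scalar reference for the boolean functions above, sketched in C; the *_16
+; variants cover rounds j < 16 and the *_64 variants rounds j >= 16:
+;
+;     uint32_t FF(uint32_t x, uint32_t y, uint32_t z, int j) {
+;         return (j < 16) ? (x ^ y ^ z) : ((x & y) | (x & z) | (y & z));
+;     }
+;     uint32_t GG(uint32_t x, uint32_t y, uint32_t z, int j) {
+;         return (j < 16) ? (x ^ y ^ z) : ((x & y) | (~x & z));
+;     }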
+
+;; void sm3_mb_x16_avx512(SM3_MB_ARGS_X16, uint32_t size)
+; arg 1 : pointer to input data
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+local_func_decl(sm3_mb_x16_avx512)
+sm3_mb_x16_avx512:
+ endbranch
+
+ mov rax, rsp
+ sub rsp, STACK_SPACE
+ and rsp, ~63 ; align stack to multiple of 64
+ mov [rsp + _rsp], rax
+
+ lea TBL, [TABLE]
+
+ ;; Initialize digests
+	vmovups	A, [DIGEST + 0*64]	; unaligned loads
+ vmovups B, [DIGEST + 1*64]
+ vmovups C, [DIGEST + 2*64]
+ vmovups D, [DIGEST + 3*64]
+ vmovups E, [DIGEST + 4*64]
+ vmovups F, [DIGEST + 5*64]
+ vmovups G, [DIGEST + 6*64]
+ vmovups H, [DIGEST + 7*64]
+
+ xor IDX, IDX
+
+%assign cur_loop 0
+lloop:
+ ;; start message expand
+ ;; Transpose input data
+ mov inp0, [IN + 0*8]
+ mov inp1, [IN + 1*8]
+ mov inp2, [IN + 2*8]
+ mov inp3, [IN + 3*8]
+ mov inp4, [IN + 4*8]
+ mov inp5, [IN + 5*8]
+ mov inp6, [IN + 6*8]
+ mov inp7, [IN + 7*8]
+
+	;; load the current 64-byte block of each lane into WB0...WB15
+	;; (zmm12...zmm27); TRANSPOSE16 below rearranges them word by word
+
+ vmovups WB0,[inp0+IDX]
+ vmovups WB1,[inp1+IDX]
+ vmovups WB2,[inp2+IDX]
+ vmovups WB3,[inp3+IDX]
+ vmovups WB4,[inp4+IDX]
+ vmovups WB5,[inp5+IDX]
+ vmovups WB6,[inp6+IDX]
+ vmovups WB7,[inp7+IDX]
+
+ mov inp0, [IN + 8*8]
+ mov inp1, [IN + 9*8]
+ mov inp2, [IN +10*8]
+ mov inp3, [IN +11*8]
+ mov inp4, [IN +12*8]
+ mov inp5, [IN +13*8]
+ mov inp6, [IN +14*8]
+ mov inp7, [IN +15*8]
+
+ vmovups WB8, [inp0+IDX]
+ vmovups WB9, [inp1+IDX]
+ vmovups WB10,[inp2+IDX]
+ vmovups WB11,[inp3+IDX]
+ vmovups WB12,[inp4+IDX]
+ vmovups WB13,[inp5+IDX]
+ vmovups WB14,[inp6+IDX]
+ vmovups WB15,[inp7+IDX]
+
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H
+
+ add IDX, 64
+
+	; transpose so each register holds one message word for all 16 lanes
+ TRANSPOSE16 WB0, WB1, WB2, WB3, WB4, WB5, WB6, WB7, WB8, WB9, WB10, WB11, WB12, WB13, WB14, WB15, TMP0, TMP1
+
+	; byte swap the 32-bit message words (input is big-endian)
+ vmovdqa32 TMP0, [SHUF_MASK]
+ vpshufb WB0,TMP0
+ vpshufb WB1,TMP0
+ vpshufb WB2,TMP0
+ vpshufb WB3,TMP0
+ vpshufb WB4,TMP0
+ vpshufb WB5,TMP0
+ vpshufb WB6,TMP0
+ vpshufb WB7,TMP0
+ vpshufb WB8,TMP0
+ vpshufb WB9,TMP0
+ vpshufb WB10,TMP0
+ vpshufb WB11,TMP0
+ vpshufb WB12,TMP0
+ vpshufb WB13,TMP0
+ vpshufb WB14,TMP0
+ vpshufb WB15,TMP0
+
+%assign I 0
+%rep 12
+ %assign J I+4
+
+ ; (A <<< 12)
+ ; store in TMP0
+ vprold TMP0,A,12
+
+ ; SS1 = ((A <<< 12) + E + (T(j) <<< j)) <<< 7
+ ; (T(j) <<< j) store in TBL
+ ; SS1 store in TMP2
+ vmovdqa32 TMP2, [TBL + (I*64)]
+ vpaddd TMP2,E
+
+ vpaddd TMP2,TMP0
+ vprold TMP2,7
+
+ ; SS2 = SS1 ^ (A <<< 12)
+ ; SS2 store in TMP3
+ vpxord TMP3,TMP2,TMP0
+
+ ; TT2 = GG(E,F,G) + H + SS1 + WB(I)
+ GG_16 E,F,G
+ vpaddd TMP2,TMP0
+ vpaddd TMP2,H
+
+ vpaddd TMP2,APPEND(WB,I)
+
+ ; TT1 = FF(A,B,C) + D + SS2 + W(I)
+ ; TT1 store in TMP3
+ FF_16 A,B,C
+ vpaddd TMP3,TMP0
+ vpaddd TMP3,D
+	; W(I) = WB(I) ^ WB(I+4)
+ vpxord TMP0,APPEND(WB,I),APPEND(WB,J)
+ vpaddd TMP3,TMP0
+
+
+ ; D = C
+ ; C = B <<< 9
+ ; B = A
+ ; A = TT1
+ ; H = G
+ ; G = F <<< 19
+ ; F = E
+ ; E = P(TT2)
+ vmovups D,C
+ vprold B,9
+ vmovups C,B
+ vmovups B,A
+ vmovups A,TMP3
+ vmovups H,G
+ vprold F,19
+ vmovups G,F
+ vmovups F,E
+ P TMP2
+ vmovups E,TMP0
+
+ ;vprold B,9
+ ;vprold F,19
+ ;P TMP2
+ ;ROTATE_ARGS
+
+ %assign I (I+1)
+%endrep
+
+
+;tmp = WB[j - 16] ^ WB[j - 9] ^ rol32(WB[j - 3], 15);
+;WB[j] = P1(tmp) ^ (rol32(WB[j - 13], 7)) ^ WB[j - 6];
+
+; rounds 12-15: the message expansion starts producing new WB words here
+%rep 4
+ %assign J I+4
+
+ %assign J_3 J-3
+ %assign J_16 J-16
+ %assign J_9 J-9
+ %assign J_13 J-13
+ %assign J_6 J-6
+
+	; calc WB(I+4)
+ vprold APPEND(WB,J),APPEND(WB,J_3),15
+ vpxord APPEND(WB,J),APPEND(WB,J_16)
+ vpxord APPEND(WB,J),APPEND(WB,J_9)
+
+ P1 APPEND(WB,J)
+
+ vprold APPEND(WB,J),APPEND(WB,J_13),7
+ vpxord APPEND(WB,J),TMP0
+ vpxord APPEND(WB,J),APPEND(WB,J_6)
+
+ ; (A <<< 12)
+ ; store in TMP0
+ vprold TMP0,A,12
+
+ ; SS1 = ((A <<< 12) + E + (T(j) <<< j)) <<< 7
+ ; (T(j) <<< j) store in TBL
+ ; SS1 store in TMP2
+ vmovdqa32 TMP2, [TBL + (I*64)]
+ vpaddd TMP2,E
+
+ vpaddd TMP2,TMP0
+ vprold TMP2,7
+
+ ; SS2 = SS1 ^ (A <<< 12)
+ ; SS2 store in TMP3
+ vpxord TMP3,TMP2,TMP0
+
+ ; TT2 = GG(E,F,G) + H + SS1 + WB(I)
+ GG_16 E,F,G
+ vpaddd TMP2,TMP0
+ vpaddd TMP2,H
+
+ vpaddd TMP2,APPEND(WB,I)
+
+ ; TT1 = FF(A,B,C) + D + SS2 + W(I)
+ ; TT1 store in TMP3
+ FF_16 A,B,C
+ vpaddd TMP3,TMP0
+ vpaddd TMP3,D
+	; W(I) = WB(I) ^ WB(I+4)
+ vpxord TMP0,APPEND(WB,I),APPEND(WB,J)
+ vpaddd TMP3,TMP0
+
+ ; D = C
+ ; C = B <<< 9
+ ; B = A
+ ; A = TT1
+ ; H = G
+ ; G = F <<< 19
+ ; F = E
+ ; E = P(TT2)
+ vmovups D,C
+ vprold B,9
+ vmovups C,B
+ vmovups B,A
+ vmovups A,TMP3
+ vmovups H,G
+ vprold F,19
+ vmovups G,F
+ vmovups F,E
+ P TMP2
+ vmovups E,TMP0
+
+ %assign I (I+1)
+%endrep
+
+%rep 48
+ %assign J (((I+4) % 20) + 20)
+
+ %assign J_3 ((J-3) % 20)
+ %assign J_16 ((J-16) % 20)
+ %assign J_9 ((J-9) % 20)
+ %assign J_13 ((J-13) % 20)
+ %assign J_6 ((J-6) % 20)
+
+ %assign I_20 (I % 20)
+ %assign J (((I+4) % 20))
+
+ vprold APPEND(WB,J),APPEND(WB,J_3),15
+ vpxord APPEND(WB,J),APPEND(WB,J_16)
+ vpxord APPEND(WB,J),APPEND(WB,J_9)
+
+ P1 APPEND(WB,J)
+
+ vprold APPEND(WB,J),APPEND(WB,J_13),7
+ vpxord APPEND(WB,J),TMP0
+ vpxord APPEND(WB,J),APPEND(WB,J_6)
+
+ ; (A <<< 12)
+ ; store in TMP0
+ vprold TMP0,A,12
+
+ ; SS1 = ((A <<< 12) + E + (T(j) <<< j)) <<< 7
+ ; (T(j) <<< j) store in TBL
+ ; SS1 store in TMP2
+ vmovdqa32 TMP2, [TBL + (I*64)]
+ vpaddd TMP2,E
+
+ vpaddd TMP2,TMP0
+ vprold TMP2,7
+
+ ; SS2 = SS1 ^ (A <<< 12)
+ ; SS2 store in TMP3
+ vpxord TMP3,TMP2,TMP0
+
+ ; TT2 = GG(E,F,G) + H + SS1 + WB(I)
+ GG_64 E,F,G
+ vpaddd TMP2,TMP0
+ vpaddd TMP2,H
+
+ vpaddd TMP2,APPEND(WB,I_20)
+
+ ; TT1 = FF(A,B,C) + D + SS2 + W(I)
+ ; TT1 store in TMP3
+ FF_64 A,B,C
+ vpaddd TMP3,TMP0
+ vpaddd TMP3,D
+	; W(I) = WB(I) ^ WB(I+4)
+ vpxord TMP0,APPEND(WB,I_20),APPEND(WB,J)
+ vpaddd TMP3,TMP0
+
+ ; D = C
+ ; C = B <<< 9
+ ; B = A
+ ; A = TT1
+ ; H = G
+ ; G = F <<< 19
+ ; F = E
+ ; E = P(TT2)
+ vmovups D,C
+ vprold B,9
+ vmovups C,B
+ vmovups B,A
+ vmovups A,TMP3
+ vmovups H,G
+ vprold F,19
+ vmovups G,F
+ vmovups F,E
+ P TMP2
+ vmovups E,TMP0
+
+ %assign I (I+1)
+%endrep
+ ; Xor old digest
+ vpxord A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpxord B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpxord C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpxord D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpxord E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpxord F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpxord G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpxord H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+ %assign cur_loop cur_loop+1
+ sub SIZE, 1
+ je last_loop
+
+ jmp lloop
+
+
+last_loop:
+
+%assign I 0
+%rep 8
+ mov inp0, [IN + (2*I)*8]
+ mov inp1, [IN + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [IN + (2*I)*8], inp0
+ mov [IN + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+ ; Write out digest
+ vmovups [DIGEST + 0*64], A
+ vmovups [DIGEST + 1*64], B
+ vmovups [DIGEST + 2*64], C
+ vmovups [DIGEST + 3*64], D
+ vmovups [DIGEST + 4*64], E
+ vmovups [DIGEST + 5*64], F
+ vmovups [DIGEST + 6*64], G
+ vmovups [DIGEST + 7*64], H
+
+
+ mov rsp, [rsp + _rsp]
+ ret
+
+
+section .data
+align 64
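+; TABLE holds the per-round constants T(j) <<< j, each value broadcast to all
+; 16 lanes (T(j) = 0x79cc4519 for j < 16 and 0x7a879d8a for j >= 16).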
+TABLE:
+ dq 0x79cc451979cc4519,0x79cc451979cc4519
+ dq 0x79cc451979cc4519,0x79cc451979cc4519
+ dq 0x79cc451979cc4519,0x79cc451979cc4519
+ dq 0x79cc451979cc4519,0x79cc451979cc4519
+ dq 0xf3988a32f3988a32,0xf3988a32f3988a32
+ dq 0xf3988a32f3988a32,0xf3988a32f3988a32
+ dq 0xf3988a32f3988a32,0xf3988a32f3988a32
+ dq 0xf3988a32f3988a32,0xf3988a32f3988a32
+ dq 0xe7311465e7311465,0xe7311465e7311465
+ dq 0xe7311465e7311465,0xe7311465e7311465
+ dq 0xe7311465e7311465,0xe7311465e7311465
+ dq 0xe7311465e7311465,0xe7311465e7311465
+ dq 0xce6228cbce6228cb,0xce6228cbce6228cb
+ dq 0xce6228cbce6228cb,0xce6228cbce6228cb
+ dq 0xce6228cbce6228cb,0xce6228cbce6228cb
+ dq 0xce6228cbce6228cb,0xce6228cbce6228cb
+ dq 0x9cc451979cc45197,0x9cc451979cc45197
+ dq 0x9cc451979cc45197,0x9cc451979cc45197
+ dq 0x9cc451979cc45197,0x9cc451979cc45197
+ dq 0x9cc451979cc45197,0x9cc451979cc45197
+ dq 0x3988a32f3988a32f,0x3988a32f3988a32f
+ dq 0x3988a32f3988a32f,0x3988a32f3988a32f
+ dq 0x3988a32f3988a32f,0x3988a32f3988a32f
+ dq 0x3988a32f3988a32f,0x3988a32f3988a32f
+ dq 0x7311465e7311465e,0x7311465e7311465e
+ dq 0x7311465e7311465e,0x7311465e7311465e
+ dq 0x7311465e7311465e,0x7311465e7311465e
+ dq 0x7311465e7311465e,0x7311465e7311465e
+ dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc
+ dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc
+ dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc
+ dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc
+ dq 0xcc451979cc451979,0xcc451979cc451979
+ dq 0xcc451979cc451979,0xcc451979cc451979
+ dq 0xcc451979cc451979,0xcc451979cc451979
+ dq 0xcc451979cc451979,0xcc451979cc451979
+ dq 0x988a32f3988a32f3,0x988a32f3988a32f3
+ dq 0x988a32f3988a32f3,0x988a32f3988a32f3
+ dq 0x988a32f3988a32f3,0x988a32f3988a32f3
+ dq 0x988a32f3988a32f3,0x988a32f3988a32f3
+ dq 0x311465e7311465e7,0x311465e7311465e7
+ dq 0x311465e7311465e7,0x311465e7311465e7
+ dq 0x311465e7311465e7,0x311465e7311465e7
+ dq 0x311465e7311465e7,0x311465e7311465e7
+ dq 0x6228cbce6228cbce,0x6228cbce6228cbce
+ dq 0x6228cbce6228cbce,0x6228cbce6228cbce
+ dq 0x6228cbce6228cbce,0x6228cbce6228cbce
+ dq 0x6228cbce6228cbce,0x6228cbce6228cbce
+ dq 0xc451979cc451979c,0xc451979cc451979c
+ dq 0xc451979cc451979c,0xc451979cc451979c
+ dq 0xc451979cc451979c,0xc451979cc451979c
+ dq 0xc451979cc451979c,0xc451979cc451979c
+ dq 0x88a32f3988a32f39,0x88a32f3988a32f39
+ dq 0x88a32f3988a32f39,0x88a32f3988a32f39
+ dq 0x88a32f3988a32f39,0x88a32f3988a32f39
+ dq 0x88a32f3988a32f39,0x88a32f3988a32f39
+ dq 0x11465e7311465e73,0x11465e7311465e73
+ dq 0x11465e7311465e73,0x11465e7311465e73
+ dq 0x11465e7311465e73,0x11465e7311465e73
+ dq 0x11465e7311465e73,0x11465e7311465e73
+ dq 0x228cbce6228cbce6,0x228cbce6228cbce6
+ dq 0x228cbce6228cbce6,0x228cbce6228cbce6
+ dq 0x228cbce6228cbce6,0x228cbce6228cbce6
+ dq 0x228cbce6228cbce6,0x228cbce6228cbce6
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a
+ dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a
+ dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a
+ dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a
+ dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14
+ dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14
+ dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14
+ dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14
+ dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629
+ dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629
+ dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629
+ dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629
+ dq 0xd43cec53d43cec53,0xd43cec53d43cec53
+ dq 0xd43cec53d43cec53,0xd43cec53d43cec53
+ dq 0xd43cec53d43cec53,0xd43cec53d43cec53
+ dq 0xd43cec53d43cec53,0xd43cec53d43cec53
+ dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7
+ dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7
+ dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7
+ dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7
+ dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f
+ dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f
+ dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f
+ dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f
+ dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e
+ dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e
+ dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e
+ dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e
+ dq 0x43cec53d43cec53d,0x43cec53d43cec53d
+ dq 0x43cec53d43cec53d,0x43cec53d43cec53d
+ dq 0x43cec53d43cec53d,0x43cec53d43cec53d
+ dq 0x43cec53d43cec53d,0x43cec53d43cec53d
+ dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a
+ dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a
+ dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a
+ dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a
+ dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5
+ dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5
+ dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5
+ dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5
+ dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea
+ dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea
+ dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea
+ dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea
+ dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4
+ dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4
+ dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4
+ dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4
+ dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8
+ dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8
+ dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8
+ dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8
+ dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50
+ dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50
+ dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50
+ dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50
+ dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1
+ dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1
+ dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1
+ dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1
+ dq 0xcec53d43cec53d43,0xcec53d43cec53d43
+ dq 0xcec53d43cec53d43,0xcec53d43cec53d43
+ dq 0xcec53d43cec53d43,0xcec53d43cec53d43
+ dq 0xcec53d43cec53d43,0xcec53d43cec53d43
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+
+
+
+PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000
+ dq 0x0000000000000001
+ dq 0x0000000000000008
+ dq 0x0000000000000009
+ dq 0x0000000000000004
+ dq 0x0000000000000005
+ dq 0x000000000000000C
+ dq 0x000000000000000D
+
+PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002
+ dq 0x0000000000000003
+ dq 0x000000000000000A
+ dq 0x000000000000000B
+ dq 0x0000000000000006
+ dq 0x0000000000000007
+ dq 0x000000000000000E
+ dq 0x000000000000000F
+
+SHUF_MASK: dq 0x0405060700010203,0x0c0d0e0f08090a0b
+ dq 0x0405060700010203,0x0c0d0e0f08090a0b
+ dq 0x0405060700010203,0x0c0d0e0f08090a0b
+ dq 0x0405060700010203,0x0c0d0e0f08090a0b
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sm3_mb_x16_avx512
+no_sm3_mb_x16_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x8_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x8_avx2.asm
new file mode 100644
index 000000000..0c2c9cdee
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x8_avx2.asm
@@ -0,0 +1,711 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sm3_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute 8-lane (oct) SM3 using AVX2
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rsi, rdi, r9-r15; eax; ymm0-15
+;; Windows clobbers: rax rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp r8
+;;
+;; Linux clobbers: rax rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp r8
+;;
+;; clobbers ymm0-15
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux definitions
+ %define arg1 rdi
+ %define arg2 rsi
+ %define reg3 rcx
+ %define reg4 rdx
+%else
+ ; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+ %define reg3 rsi
+ %define reg4 rdi
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+%define SIZE INP_SIZE ; rsi
+
+%define IDX rax
+%define TBL reg3
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 reg4
+
+%define APPEND(a,b) a %+ b
+
+%define WB0 ymm0
+%define WB1 ymm1
+%define WB2 ymm2
+%define WB3 ymm3
+%define WB4 ymm4
+%define WB5 ymm5
+%define WB6 ymm6
+%define WB7 ymm7
+%define WB8 ymm8
+%define WB9 ymm9
+%define WB10 ymm10
+%define WB11 ymm11
+%define WB12 ymm12
+%define WB13 ymm13
+%define WB14 ymm14
+%define WB15 ymm15
+
+%define WBTMP0 ymm8
+%define WBTMP1 ymm9
+
+%define WBTMP2 ymm0
+%define WBTMP3 ymm1
+
+%define A ymm0
+%define B ymm1
+%define C ymm2
+%define D ymm3
+%define E ymm4
+%define F ymm5
+%define G ymm6
+%define H ymm7
+
+%define TMP0 ymm8
+%define TMP1 ymm9
+%define TMP2 ymm10
+
+; W'(j) = WB(j) xor WB(j+4)
+; Keep WB(j)..WB(j+4) in registers (Wj0-Wj4) to reduce memory reads
+%define Wj0 ymm11
+%define Wj1 ymm12
+%define Wj2 ymm13
+%define Wj3 ymm14
+%define Wj4 ymm15
+
+
+%define SZ8 8*SM3_DIGEST_WORD_SIZE ; Size of one vector register
+%define PTR_SZ 8
+%define SM3_DIGEST_WORD_SIZE 4
+%define MAX_SM3_LANES 8
+%define NUM_SM3_DIGEST_WORDS 8
+%define SM3_DIGEST_ROW_SIZE (MAX_SM3_LANES * SM3_DIGEST_WORD_SIZE)
+
+; Define stack usage
+
+;; Assume stack aligned to 32 bytes before call
+;; Therefore FRAMESZ mod 32 must be 32-8 = 24
+struc stack_frame
+ .data resb 16*SZ8
+ .digest resb 8*SZ8
+ .wbtmp resb 69*SZ8
+ .rsp resb 8
+endstruc
+%define FRAMESZ stack_frame_size
+%define _DIGEST stack_frame.digest
+%define _WBTMP stack_frame.wbtmp
+%define _RSP_SAVE stack_frame.rsp
+
+%define YTMP0 rsp + _WBTMP + 0*SZ8
+%define YTMP1 rsp + _WBTMP + 1*SZ8
+%define YTMP2 rsp + _WBTMP + 2*SZ8
+%define YTMP3 rsp + _WBTMP + 3*SZ8
+%define YTMP4 rsp + _WBTMP + 4*SZ8
+
+%define YTMPI rsp + _WBTMP + I*SZ8
+%define YTMPI_1 rsp + _WBTMP + (I - 1)*SZ8
+%define YTMPI_2 rsp + _WBTMP + (I - 2)*SZ8
+%define YTMPI_4 rsp + _WBTMP + (I - 4)*SZ8
+%define YTMPI5 rsp + _WBTMP + (I + 5)*SZ8
+
+
+%define VMOVPS vmovups
+
+;;;;;;;;
+; same as sha256
+;;;;;;;;
+%macro TRANSPOSE8 10
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ ; process bottom half (r4..r7) {e...h}
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
+
+ vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6
+ vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2
+ vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5
+ vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1
+ vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7
+ vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3
+ vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4
+ vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0
+%endmacro
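
The TRANSPOSE8 macro above rearranges eight ymm registers, each holding eight 32-bit words from one message lane, into eight registers that each hold the same word index across all lanes. A scalar sketch of the equivalent operation, illustrative only and not part of isa-l:

#include <stdint.h>

/* Treat the eight registers as rows of an 8x8 matrix of 32-bit words and
 * transpose in place: word c of lane r moves to position [c][r], so each
 * "register" (row) ends up holding one word index across all eight lanes. */
static void transpose8x8(uint32_t m[8][8])
{
	for (int r = 0; r < 8; r++)
		for (int c = r + 1; c < 8; c++) {
			uint32_t t = m[r][c];
			m[r][c] = m[c][r];
			m[c][r] = t;
		}
}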
+
+%macro ROTATE_W 0
+
+ %xdefine TMP_ Wj0
+ %xdefine Wj0 Wj1
+ %xdefine Wj1 Wj2
+ %xdefine Wj2 Wj3
+ %xdefine Wj3 Wj4
+
+ %xdefine Wj4 TMP_
+
+%endmacro
+
+; ROTATE A,B,C,D
+%macro ROTATE_ARGS_AD 0
+
+ %xdefine TMP_ D
+ %xdefine D C
+ %xdefine C B
+ %xdefine B A
+ %xdefine A TMP2
+ %xdefine TMP2 TMP_
+
+%endmacro
+
+%macro ROTATE_ARGS_EH 0
+
+ %xdefine TMP_ H
+ %xdefine H G
+ %xdefine G F
+ %xdefine F E
+ %xdefine E TMP0
+ %xdefine TMP0 TMP_
+
+%endmacro
+
+%macro ROLD 3
+
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpslld %%tmp, %%reg, %%imm
+ vpsrld %%reg, %%reg, (32-(%%imm))
+ vpor %%reg, %%reg, %%tmp
+
+%endmacro
+
+%macro ROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpslld %%tmp, %%src, %%imm
+ vpsrld %%reg, %%src, (32-(%%imm))
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+;; void sm3_mb_x8_avx2(SM3_ARGS *args, uint64_t num_blocks);
+;; arg 1 : STATE : pointer to args structure (lane data pointers and digests)
+;; arg 2 : INP_SIZE : size of input in blocks
+mk_global sm3_mb_x8_avx2,function,internal
+align 16
+sm3_mb_x8_avx2:
+ endbranch
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the YMM registers
+
+ ; save rsp, allocate 32-byte aligned for local variables
+ mov IDX, rsp
+ sub rsp, FRAMESZ
+ and rsp, ~31
+ mov [rsp + _RSP_SAVE], IDX
+
+ lea TBL,[TABLE]
+
+ ;; load the address of each of the 8 message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _args_data_ptr + 0*PTR_SZ]
+ mov inp1,[STATE + _args_data_ptr + 1*PTR_SZ]
+ mov inp2,[STATE + _args_data_ptr + 2*PTR_SZ]
+ mov inp3,[STATE + _args_data_ptr + 3*PTR_SZ]
+ mov inp4,[STATE + _args_data_ptr + 4*PTR_SZ]
+ mov inp5,[STATE + _args_data_ptr + 5*PTR_SZ]
+ mov inp6,[STATE + _args_data_ptr + 6*PTR_SZ]
+ mov inp7,[STATE + _args_data_ptr + 7*PTR_SZ]
+
+ xor IDX, IDX
+
+%assign cur_loop 0
+lloop:
+
+	;
+	; Pre-calculate the WB 0..68 and W 0..64
+	; This is better than calculating WB/W in the round method
+	;
+	; ps : SHA256(AVX2) calculates WB/W in the round method
+	;
+	; Pre-calculation memory I/O:
+	; 	read : 68 + 3 * 52 (read WB)
+	;	write : 52 (write WB17..68)
+	; Round-method calculation memory I/O:
+	;	read : 48 * 6 (read 6 WB values each round)
+	;	write : 52 + 64 (same as above)
+	;
+ VMOVPS WB0,[inp0+IDX]
+ VMOVPS WB1,[inp1+IDX]
+ VMOVPS WB2,[inp2+IDX]
+ VMOVPS WB3,[inp3+IDX]
+ VMOVPS WB4,[inp4+IDX]
+ VMOVPS WB5,[inp5+IDX]
+ VMOVPS WB6,[inp6+IDX]
+ VMOVPS WB7,[inp7+IDX]
+
+ TRANSPOSE8 WB0, WB1, WB2, WB3, WB4, WB5, WB6, WB7, WBTMP0, WBTMP1
+ vmovdqa WBTMP0, [SHUF_MASK]
+ vpshufb WB0,WBTMP0
+ vpshufb WB1,WBTMP0
+ vpshufb WB2,WBTMP0
+ vpshufb WB3,WBTMP0
+ vpshufb WB4,WBTMP0
+ vpshufb WB5,WBTMP0
+ vpshufb WB6,WBTMP0
+ vpshufb WB7,WBTMP0
+
+ vmovdqa [YTMP0], WB0
+ vmovdqa [YTMP1], WB1
+
+ VMOVPS WB8,[inp0+IDX + 32]
+ VMOVPS WB9,[inp1+IDX + 32]
+ VMOVPS WB10,[inp2+IDX + 32]
+ VMOVPS WB11,[inp3+IDX + 32]
+ VMOVPS WB12,[inp4+IDX + 32]
+ VMOVPS WB13,[inp5+IDX + 32]
+ VMOVPS WB14,[inp6+IDX + 32]
+ VMOVPS WB15,[inp7+IDX + 32]
+
+ TRANSPOSE8 WB8, WB9, WB10, WB11, WB12, WB13, WB14, WB15, WBTMP2, WBTMP3
+ vmovdqa WBTMP2, [SHUF_MASK]
+ vpshufb WB8,WBTMP2
+ vpshufb WB9,WBTMP2
+ vpshufb WB10,WBTMP2
+ vpshufb WB11,WBTMP2
+ vpshufb WB12,WBTMP2
+ vpshufb WB13,WBTMP2
+ vpshufb WB14,WBTMP2
+ vpshufb WB15,WBTMP2
+
+; WB0 WB1 already saved
+%assign I 2
+%rep 14
+ vmovdqa [YTMPI], APPEND(WB,I)
+%assign I (I+1)
+%endrep
+
+ vmovdqa WB0 , [YTMP0]
+ vmovdqa WB1 , [YTMP1]
+
+; Calculate WB 16...67
+%rep 52
+ %assign J (I % 16)
+ %assign J_1 ((I-1) % 16) ;tmp to use
+ %assign J_2 ((I-2) % 16) ;tmp to use
+ %assign J_3 ((I-3) % 16)
+ %assign J_4 ((I-4) % 16) ;tmp to use
+ %assign J_9 ((I-9) % 16)
+ %assign J_13 ((I-13) % 16)
+ %assign J_6 ((I-6) % 16)
+
+ ROLD_nd APPEND(WB,J_2),15,APPEND(WB,J_1),APPEND(WB,J_3)
+ vpxor APPEND(WB,J),APPEND(WB,J_2)
+ vpxor APPEND(WB,J),APPEND(WB,J_9)
+
+ ROLD_nd APPEND(WB,J_2),15,APPEND(WB,J_1),APPEND(WB,J)
+ ROLD_nd APPEND(WB,J_1),23,APPEND(WB,J_4),APPEND(WB,J)
+ vpxor APPEND(WB,J),APPEND(WB,J_2)
+ vpxor APPEND(WB,J),APPEND(WB,J_1)
+
+ ROLD_nd APPEND(WB,J_2),7,APPEND(WB,J_1),APPEND(WB,J_13)
+ vpxor APPEND(WB,J),APPEND(WB,J_2)
+ vpxor APPEND(WB,J),APPEND(WB,J_6)
+
+ vmovdqa [YTMPI], APPEND(WB,J)
+
+ vmovdqa APPEND(WB,J_1), [YTMPI_1]
+ vmovdqa APPEND(WB,J_2), [YTMPI_2]
+ vmovdqa APPEND(WB,J_4), [YTMPI_4]
+
+ %assign I (I+1)
+%endrep
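
For reference, the %rep 52 block above is an 8-lane vectorization of the standard SM3 message expansion. A scalar C sketch follows; it is illustrative only, with hypothetical helper names rotl32() and sm3_p1(), and uses the 0-based indices WB16..WB67 noted in the comment "Calculate WB 16...67" above.

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, int r)
{
	return (x << r) | (x >> (32 - r));
}

static inline uint32_t sm3_p1(uint32_t x)	/* P1 permutation */
{
	return x ^ rotl32(x, 15) ^ rotl32(x, 23);
}

/* wb[0..15] hold the byte-swapped message block; this fills wb[16..67],
 * mirroring the ROLD_nd/vpxor sequence in the block above. */
static void sm3_expand(uint32_t wb[68])
{
	for (int j = 16; j < 68; j++)
		wb[j] = sm3_p1(wb[j - 16] ^ wb[j - 9] ^ rotl32(wb[j - 3], 15))
			^ rotl32(wb[j - 13], 7) ^ wb[j - 6];
}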
+
+ add IDX, 4*4*4
+
+	; A-H must be loaded for every block,
+	; because their registers were reused while pre-calculating WB
+ vmovdqu A,[STATE + 0*SM3_DIGEST_ROW_SIZE]
+ vmovdqu B,[STATE + 1*SM3_DIGEST_ROW_SIZE]
+ vmovdqu C,[STATE + 2*SM3_DIGEST_ROW_SIZE]
+ vmovdqu D,[STATE + 3*SM3_DIGEST_ROW_SIZE]
+ vmovdqu E,[STATE + 4*SM3_DIGEST_ROW_SIZE]
+ vmovdqu F,[STATE + 5*SM3_DIGEST_ROW_SIZE]
+ vmovdqu G,[STATE + 6*SM3_DIGEST_ROW_SIZE]
+ vmovdqu H,[STATE + 7*SM3_DIGEST_ROW_SIZE]
+
+ vmovdqa Wj0, [YTMP0]
+ vmovdqa Wj1, [YTMP1]
+ vmovdqa Wj2, [YTMP2]
+ vmovdqa Wj3, [YTMP3]
+ vmovdqa Wj4, [YTMP4]
+
+
+%assign I 0
+%rep 16
+
+ ; SS1 - TMP1
+ ROLD_nd TMP0,12,TMP1,A
+ vmovdqa TMP1, [TBL + (I*32)]
+ vpaddd TMP1,E
+ vpaddd TMP1,TMP0
+ ROLD TMP1,7,TMP2
+
+ ; SS2 - TMP2
+ vpxor TMP2,TMP1,TMP0
+
+ ; TT1
+ vpxor TMP0,A,B
+ vpxor TMP0,C
+ vpaddd TMP2,TMP0
+ vpaddd TMP2,D
+ vpxor TMP0,Wj0,Wj4
+ vpaddd TMP2,TMP0
+
+ ROLD B,9,TMP0
+
+	; Rotate a,b,c,d first;
+	; after P0(TT2), Wj0 will be released
+ ROTATE_ARGS_AD
+
+ ; P0(TT2)
+ vpxor TMP0,E,F
+ vpxor TMP0,G
+ vpaddd TMP0,H
+ vpaddd TMP0,TMP1
+ vpaddd TMP0,Wj0
+
+ ROLD_nd TMP1,9,TMP2,TMP0
+ ROLD_nd Wj0,17,TMP2,TMP0
+
+ vpxor TMP0,TMP1
+ vpxor TMP0,Wj0
+
+ ROLD F,19,TMP2
+
+ ROTATE_ARGS_EH
+
+ ROTATE_W
+
+ vmovdqa Wj4, [YTMPI5]
+ %assign I (I+1)
+%endrep
+
+%rep 48
+ ; SS1 - TMP1
+ ROLD_nd TMP0,12,TMP1,A
+ vmovdqa TMP1, [TBL + (I*32)]
+ vpaddd TMP1,E
+ vpaddd TMP1,TMP0
+ ROLD TMP1,7,TMP2
+
+ ; SS2 - TMP2
+ vpxor TMP2,TMP1,TMP0
+
+	; Add SS2 + D first, so D is released early
+	; FF16/GG16 differ from FF64/GG64,
+	; so the register holding D must be released before TT1 is calculated
+ vpaddd TMP2,D
+
+ ; TT1
+ vpor TMP0,A,B
+ vpand TMP0,C
+ vpand D,A,B
+ vpor TMP0,D
+
+ vpaddd TMP2,TMP0
+ vpxor TMP0,Wj0,Wj4
+ vpaddd TMP2,TMP0
+
+ ROLD B,9,TMP0
+
+ ROTATE_ARGS_AD
+
+ ; P0(TT2)
+ vpaddd TMP1,H
+ vpaddd TMP1,Wj0
+
+ vpand TMP0,E,F
+ vpandn Wj0,E,G
+ vpor TMP0,Wj0
+
+ vpaddd TMP0,TMP1
+
+ ROLD_nd TMP1,9,TMP2,TMP0
+ ROLD_nd Wj0,17,TMP2,TMP0
+
+ vpxor TMP0,TMP1
+ vpxor TMP0,Wj0
+
+ ROLD F,19,TMP2
+
+ ROTATE_ARGS_EH
+
+ ROTATE_W
+ vmovdqa Wj4, [YTMPI5]
+ %assign I (I+1)
+%endrep
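
The two round loops above (16 rounds with the xor forms of FF/GG, then 48 rounds with the boolean forms) correspond to the scalar sketch of one SM3 compression round below. It is illustrative only: it reuses rotl32() from the expansion sketch earlier, takes the round constant t_j from TABLE, and uses the textbook majority form of FF where the assembly uses the equivalent (A|B)&C | (A&B).

/* v[0..7] = a,b,c,d,e,f,g,h; wj = W[j]; wj4 = W[j+4], so wj ^ wj4 is W'[j]. */
static void sm3_round(uint32_t v[8], uint32_t wj, uint32_t wj4,
		      uint32_t t_j, int j)
{
	uint32_t a = v[0], b = v[1], c = v[2], d = v[3];
	uint32_t e = v[4], f = v[5], g = v[6], h = v[7];

	uint32_t ss1 = rotl32(rotl32(a, 12) + e + t_j, 7);
	uint32_t ss2 = ss1 ^ rotl32(a, 12);
	uint32_t ff  = (j < 16) ? (a ^ b ^ c) : ((a & b) | (a & c) | (b & c));
	uint32_t gg  = (j < 16) ? (e ^ f ^ g) : ((e & f) | (~e & g));
	uint32_t tt1 = ff + d + ss2 + (wj ^ wj4);
	uint32_t tt2 = gg + h + ss1 + wj;

	v[3] = c;					/* D = C        */
	v[2] = rotl32(b, 9);				/* C = B <<< 9  */
	v[1] = a;					/* B = A        */
	v[0] = tt1;					/* A = TT1      */
	v[7] = g;					/* H = G        */
	v[6] = rotl32(f, 19);				/* G = F <<< 19 */
	v[5] = e;					/* F = E        */
	v[4] = tt2 ^ rotl32(tt2, 9) ^ rotl32(tt2, 17);	/* E = P0(TT2)  */
}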
+
+ vpxor A, A, [STATE + 0*SM3_DIGEST_ROW_SIZE]
+ vpxor B, B, [STATE + 1*SM3_DIGEST_ROW_SIZE]
+ vpxor C, C, [STATE + 2*SM3_DIGEST_ROW_SIZE]
+ vpxor D, D, [STATE + 3*SM3_DIGEST_ROW_SIZE]
+ vpxor E, E, [STATE + 4*SM3_DIGEST_ROW_SIZE]
+ vpxor F, F, [STATE + 5*SM3_DIGEST_ROW_SIZE]
+ vpxor G, G, [STATE + 6*SM3_DIGEST_ROW_SIZE]
+ vpxor H, H, [STATE + 7*SM3_DIGEST_ROW_SIZE]
+
+ ; Write back to memory (state object) the transposed digest
+ vmovdqu [STATE + 0*SM3_DIGEST_ROW_SIZE],A
+ vmovdqu [STATE + 1*SM3_DIGEST_ROW_SIZE],B
+ vmovdqu [STATE + 2*SM3_DIGEST_ROW_SIZE],C
+ vmovdqu [STATE + 3*SM3_DIGEST_ROW_SIZE],D
+ vmovdqu [STATE + 4*SM3_DIGEST_ROW_SIZE],E
+ vmovdqu [STATE + 5*SM3_DIGEST_ROW_SIZE],F
+ vmovdqu [STATE + 6*SM3_DIGEST_ROW_SIZE],G
+ vmovdqu [STATE + 7*SM3_DIGEST_ROW_SIZE],H
+
+ sub SIZE, 1
+ je last_loop
+ jmp lloop
+
+last_loop:
+
+
+ ; update input pointers
+ add inp0, IDX
+ mov [STATE + _args_data_ptr + 0*8], inp0
+ add inp1, IDX
+ mov [STATE + _args_data_ptr + 1*8], inp1
+ add inp2, IDX
+ mov [STATE + _args_data_ptr + 2*8], inp2
+ add inp3, IDX
+ mov [STATE + _args_data_ptr + 3*8], inp3
+ add inp4, IDX
+ mov [STATE + _args_data_ptr + 4*8], inp4
+ add inp5, IDX
+ mov [STATE + _args_data_ptr + 5*8], inp5
+ add inp6, IDX
+ mov [STATE + _args_data_ptr + 6*8], inp6
+ add inp7, IDX
+ mov [STATE + _args_data_ptr + 7*8], inp7
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+ mov rsp, [rsp + _RSP_SAVE]
+ ret
+
+
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+align 64
+global TABLE
+TABLE:
+ dq 0x79cc451979cc4519,0x79cc451979cc4519
+ dq 0x79cc451979cc4519,0x79cc451979cc4519
+ dq 0xf3988a32f3988a32,0xf3988a32f3988a32
+ dq 0xf3988a32f3988a32,0xf3988a32f3988a32
+ dq 0xe7311465e7311465,0xe7311465e7311465
+ dq 0xe7311465e7311465,0xe7311465e7311465
+ dq 0xce6228cbce6228cb,0xce6228cbce6228cb
+ dq 0xce6228cbce6228cb,0xce6228cbce6228cb
+ dq 0x9cc451979cc45197,0x9cc451979cc45197
+ dq 0x9cc451979cc45197,0x9cc451979cc45197
+ dq 0x3988a32f3988a32f,0x3988a32f3988a32f
+ dq 0x3988a32f3988a32f,0x3988a32f3988a32f
+ dq 0x7311465e7311465e,0x7311465e7311465e
+ dq 0x7311465e7311465e,0x7311465e7311465e
+ dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc
+ dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc
+ dq 0xcc451979cc451979,0xcc451979cc451979
+ dq 0xcc451979cc451979,0xcc451979cc451979
+ dq 0x988a32f3988a32f3,0x988a32f3988a32f3
+ dq 0x988a32f3988a32f3,0x988a32f3988a32f3
+ dq 0x311465e7311465e7,0x311465e7311465e7
+ dq 0x311465e7311465e7,0x311465e7311465e7
+ dq 0x6228cbce6228cbce,0x6228cbce6228cbce
+ dq 0x6228cbce6228cbce,0x6228cbce6228cbce
+ dq 0xc451979cc451979c,0xc451979cc451979c
+ dq 0xc451979cc451979c,0xc451979cc451979c
+ dq 0x88a32f3988a32f39,0x88a32f3988a32f39
+ dq 0x88a32f3988a32f39,0x88a32f3988a32f39
+ dq 0x11465e7311465e73,0x11465e7311465e73
+ dq 0x11465e7311465e73,0x11465e7311465e73
+ dq 0x228cbce6228cbce6,0x228cbce6228cbce6
+ dq 0x228cbce6228cbce6,0x228cbce6228cbce6
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a
+ dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a
+ dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14
+ dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14
+ dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629
+ dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629
+ dq 0xd43cec53d43cec53,0xd43cec53d43cec53
+ dq 0xd43cec53d43cec53,0xd43cec53d43cec53
+ dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7
+ dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7
+ dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f
+ dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f
+ dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e
+ dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e
+ dq 0x43cec53d43cec53d,0x43cec53d43cec53d
+ dq 0x43cec53d43cec53d,0x43cec53d43cec53d
+ dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a
+ dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a
+ dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5
+ dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5
+ dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea
+ dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea
+ dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4
+ dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4
+ dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8
+ dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8
+ dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50
+ dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50
+ dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1
+ dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1
+ dq 0xcec53d43cec53d43,0xcec53d43cec53d43
+ dq 0xcec53d43cec53d43,0xcec53d43cec53d43
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+
+SHUF_MASK: dq 0x0405060700010203,0x0c0d0e0f08090a0b
+ dq 0x0405060700010203,0x0c0d0e0f08090a0b
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_multibinary.asm
new file mode 100644
index 000000000..482876539
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_multibinary.asm
@@ -0,0 +1,81 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+default rel
+[bits 64]
+
+extern sm3_ctx_mgr_init_base
+extern sm3_ctx_mgr_submit_base
+extern sm3_ctx_mgr_flush_base
+
+extern sm3_ctx_mgr_init_avx2
+extern sm3_ctx_mgr_submit_avx2
+extern sm3_ctx_mgr_flush_avx2
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ extern sm3_ctx_mgr_init_avx512
+ extern sm3_ctx_mgr_submit_avx512
+ extern sm3_ctx_mgr_flush_avx512
+%endif
+
+;;; *_mbinit are the initial values for *_dispatched, which is updated on the first call.
+;;; Therefore, *_dispatch_init is only executed on the first call.
+
+; Initialise symbols
+mbin_interface sm3_ctx_mgr_init
+mbin_interface sm3_ctx_mgr_submit
+mbin_interface sm3_ctx_mgr_flush
+
+;; sse/avx versions are not implemented yet
+%ifdef HAVE_AS_KNOWS_AVX512
+ mbin_dispatch_init6 sm3_ctx_mgr_init, sm3_ctx_mgr_init_base, \
+ sm3_ctx_mgr_init_base, sm3_ctx_mgr_init_base, sm3_ctx_mgr_init_avx2, \
+ sm3_ctx_mgr_init_avx512
+ mbin_dispatch_init6 sm3_ctx_mgr_submit, sm3_ctx_mgr_submit_base, \
+ sm3_ctx_mgr_submit_base, sm3_ctx_mgr_submit_base, sm3_ctx_mgr_submit_avx2, \
+ sm3_ctx_mgr_submit_avx512
+ mbin_dispatch_init6 sm3_ctx_mgr_flush, sm3_ctx_mgr_flush_base, \
+ sm3_ctx_mgr_flush_base, sm3_ctx_mgr_flush_base, sm3_ctx_mgr_flush_avx2, \
+ sm3_ctx_mgr_flush_avx512
+%else
+ mbin_dispatch_init sm3_ctx_mgr_init, sm3_ctx_mgr_init_base, \
+ sm3_ctx_mgr_init_base,sm3_ctx_mgr_init_avx2
+ mbin_dispatch_init sm3_ctx_mgr_submit, sm3_ctx_mgr_submit_base, \
+ sm3_ctx_mgr_submit_base,sm3_ctx_mgr_submit_avx2
+ mbin_dispatch_init sm3_ctx_mgr_flush, sm3_ctx_mgr_flush_base, \
+ sm3_ctx_mgr_flush_base,sm3_ctx_mgr_flush_avx2
+%endif
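
As the comments above note, the mbin_interface/mbin_dispatch_init macros bind each public symbol lazily on the first call. A rough, self-contained C analogue of that idea is sketched below; the stub functions, the my_ prefix and __builtin_cpu_supports() (a GCC/Clang builtin) are stand-ins for the real implementations and CPUID checks, not the actual macro expansion.

#include <stdio.h>

/* stand-ins for the real base/AVX2 implementations, illustration only */
static void sm3_init_base_stub(void *mgr) { (void)mgr; puts("base"); }
static void sm3_init_avx2_stub(void *mgr) { (void)mgr; puts("avx2"); }

static void sm3_init_dispatch(void *mgr);
static void (*sm3_init_ptr)(void *) = sm3_init_dispatch;

/* first call: pick an implementation from CPU features, then forward to it */
static void sm3_init_dispatch(void *mgr)
{
	sm3_init_ptr = __builtin_cpu_supports("avx2")
			? sm3_init_avx2_stub : sm3_init_base_stub;
	sm3_init_ptr(mgr);
}

/* public entry point: later calls go straight to the resolved version */
void my_sm3_ctx_mgr_init(void *mgr)
{
	sm3_init_ptr(mgr);
}

int main(void)
{
	my_sm3_ctx_mgr_init(NULL);	/* resolves and runs on the first call */
	my_sm3_ctx_mgr_init(NULL);	/* goes directly to the chosen version */
	return 0;
}

Only the first call pays the feature-detection cost; every later call goes through the already-resolved pointer.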
+
+;;; func core, ver, snum
+slversion sm3_ctx_mgr_init, 00, 00, 2300
+slversion sm3_ctx_mgr_submit, 00, 00, 2301
+slversion sm3_ctx_mgr_flush, 00, 00, 2302
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ref_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ref_test.c
new file mode 100644
index 000000000..be56350b3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ref_test.c
@@ -0,0 +1,207 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#define ISAL_UNIT_TEST
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sm3_mb.h"
+#include "endian_helper.h"
+
+typedef uint32_t digest_sm3[SM3_DIGEST_NWORDS];
+
+#define MSGS 2
+#define NUM_JOBS 1000
+
+#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS
+
+static uint8_t msg1[] = "abc";
+static uint8_t msg2[] = "abcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcd";
+
+/* little endian */
+static digest_sm3 exp_result_digest1 = { 0x66c7f0f4, 0x62eeedd9, 0xd1f2d46b, 0xdc10e4e2,
+ 0x4167c487, 0x5cf2f7a2, 0x297da02b, 0x8f4ba8e0
+};
+
+/* little endian */
+static digest_sm3 exp_result_digest2 = { 0xdebe9ff9, 0x2275b8a1, 0x38604889, 0xc18e5a4d,
+ 0x6fdb70e5, 0x387e5765, 0x293dcba3, 0x9c0c5732
+};
+
+static uint8_t *msgs[MSGS] = { msg1, msg2 };
+
+static uint32_t *exp_result_digest[MSGS] = {
+ exp_result_digest1, exp_result_digest2
+};
+
+int main(void)
+{
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+ uint32_t i, j, k, t, checked = 0;
+ uint32_t *good;
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sm3_ctx_mgr_init(mgr);
+
+ // Init contexts before first use
+ for (i = 0; i < MSGS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ for (i = 0; i < MSGS; i++) {
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[i], strlen((char *)msgs[i]), HASH_ENTIRE);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = exp_result_digest[t];
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (byteswap32(good[j]) != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j],
+ byteswap32(good[j]));
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+
+ }
+ }
+
+ while (1) {
+ ctx = sm3_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = exp_result_digest[t];
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (byteswap32(good[j]) != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j],
+ byteswap32(good[j]));
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+				printf("Something bad happened during the flush."
+				       " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ // do larger test in pseudo-random order
+
+ // Init contexts before first use
+ for (i = 0; i < NUM_JOBS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ checked = 0;
+ for (i = 0; i < NUM_JOBS; i++) {
+ j = PSEUDO_RANDOM_NUM(i);
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = exp_result_digest[k];
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (byteswap32(good[j]) != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j],
+ byteswap32(good[j]));
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+ }
+ }
+ while (1) {
+ ctx = sm3_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = exp_result_digest[k];
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (byteswap32(good[j]) != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j],
+ byteswap32(good[j]));
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+				printf("Something bad happened during the flush."
+				       " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ if (checked != NUM_JOBS) {
+ printf("only tested %d rather than %d\n", checked, NUM_JOBS);
+ return -1;
+ }
+
+ printf(" multibinary_sm3 test: Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_test_helper.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_test_helper.c
new file mode 100644
index 000000000..4c0c54436
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_test_helper.c
@@ -0,0 +1,45 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <openssl/evp.h>
+
+void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest)
+{
+ EVP_MD_CTX *md_ctx;
+ const EVP_MD *md;
+ unsigned int md_len;
+
+ md = EVP_sm3();
+ md_ctx = EVP_MD_CTX_new();
+ EVP_DigestInit_ex(md_ctx, md, NULL);
+ EVP_DigestUpdate(md_ctx, buf, length);
+ EVP_DigestFinal_ex(md_ctx, digest, &md_len);
+ EVP_MD_CTX_free(md_ctx);
+}
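
For context, a minimal standalone caller of sm3_ossl() could look like the sketch below. It is not part of the test suite; it assumes the 32-byte SM3 digest size and reuses the "abc" vector whose expected digest appears in sm3_ref_test.c. Link it against sm3_test_helper.c and -lcrypto, as the extended-tests Makefile does.

#include <stdio.h>
#include <string.h>

void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest);

int main(void)
{
	unsigned char digest[32];	/* SM3 produces a 256-bit digest */
	const char *msg = "abc";

	sm3_ossl((const unsigned char *)msg, strlen(msg), digest);

	for (int i = 0; i < 32; i++)
		printf("%02x", digest[i]);
	printf("\n");	/* expected: 66c7f0f4..., as in exp_result_digest1 */
	return 0;
}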
diff --git a/src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile b/src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile
new file mode 100644
index 000000000..964baee11
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile
@@ -0,0 +1,19 @@
+CFLAGS += -I ../../include
+libs += ../../bin/isa-l_crypto.a
+tests = $(patsubst %test.c, %test, $(wildcard *_test.c))
+
+tests: $(tests)
+$(tests): $(libs)
+%test: %test.c
+ $(CC) $< $(libs) $(CFLAGS) $(LDLIBS) -o $@
+$(libs):
+ $(MAKE) -C ../../ -f Makefile.unx
+test: $(addsuffix .run,$(tests))
+ @echo ALL PASS
+$(addsuffix .run,$(tests)): %.run: %
+ $(SIM) ./$<
+ @echo Completed run: $<
+clean:
+ $(RM) *.o $(tests)
+
+$(tests): LDLIBS += -lcrypto
diff --git a/src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile.nmake b/src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile.nmake
new file mode 100644
index 000000000..daaf04e79
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile.nmake
@@ -0,0 +1,58 @@
+########################################################################
+# Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+tests = md5_mb_over_4GB_test.exe sha1_mb_over_4GB_test.exe \
+sha256_mb_over_4GB_test.exe sha512_mb_over_4GB_test.exe
+
+INCLUDES = -I../../include
+LINKFLAGS = /nologo
+INCLUDES = $(INCLUDES) -Ic:\OpenSSL-Win64\include
+CFLAGS = -O2 -D NDEBUG /nologo -D_USE_MATH_DEFINES -Qstd=c99 $(INCLUDES) /c
+LINKFLAGS = $(LINKFLAGS) /libpath:c:\OpenSSL-Win64\lib
+LIBS = ../../isa-l_crypto.lib
+DLL = isa-l_crypto.dll
+
+tests: lib $(tests)
+$(tests): $(@B).obj
+ link /out:$@ $(LINKFLAGS) libeay32.lib $(LIBS) $*.obj
+%.obj: %.c
+ $(CC) $(CFLAGS) -Fo$@ $?
+
+lib:
+ cd ../../ && nmake -f Makefile.nmake
+ cd ../../ && copy $(DLL) "tests\extended"
+
+test: $(tests)
+ !$?
+ echo ALL PASS
+
+clean:
+ -if exist *.obj del *.obj
+ -if exist *.exe del *.exe
+ -if exist *.dll del *.dll
diff --git a/src/crypto/isa-l/isa-l_crypto/tests/extended/md5_mb_over_4GB_test.c b/src/crypto/isa-l/isa-l_crypto/tests/extended/md5_mb_over_4GB_test.c
new file mode 100644
index 000000000..5eb7be75f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tests/extended/md5_mb_over_4GB_test.c
@@ -0,0 +1,155 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "md5_mb.h"
+#include "endian_helper.h"
+#include <openssl/md5.h>
+#define TEST_LEN (1024*1024ull) //1M
+#define TEST_BUFS MD5_MIN_LANES
+#define ROTATION_TIMES 10000 //total length processing = TEST_LEN * ROTATION_TIMES
+#define UPDATE_SIZE (13*MD5_BLOCK_SIZE)
+#define LEN_TOTAL (TEST_LEN * ROTATION_TIMES)
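
Worked out, LEN_TOTAL = 1 MiB * 10000 = 10,485,760,000 bytes per lane, roughly 9.8 GiB, so every context's running length crosses the 4 GiB (32-bit) boundary this test is named for; the same arithmetic applies to the sha1, sha256, sha512 and sm3 variants below.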
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ref_upd[4 * MD5_DIGEST_NWORDS];
+
+struct user_data {
+ int idx;
+ uint64_t processed;
+};
+
+int main(void)
+{
+ MD5_CTX o_ctx; //openSSL
+ MD5_HASH_CTX_MGR *mgr = NULL;
+ MD5_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, k, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ struct user_data udata[TEST_BUFS];
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+ md5_ctx_mgr_init(mgr);
+
+ printf("md5_large_test\n");
+
+ // Init ctx contents
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)&udata[i];
+ }
+
+ //Openssl MD5 update test
+ MD5_Init(&o_ctx);
+ for (k = 0; k < ROTATION_TIMES; k++) {
+ MD5_Update(&o_ctx, bufs[k % TEST_BUFS], TEST_LEN);
+ }
+ MD5_Final(digest_ref_upd, &o_ctx);
+
+ // Initialize pool
+ for (i = 0; i < TEST_BUFS; i++) {
+ struct user_data *u = (struct user_data *)ctxpool[i].user_data;
+ u->idx = i;
+ u->processed = 0;
+ }
+
+ printf("Starting updates\n");
+ int highest_pool_idx = 0;
+ ctx = &ctxpool[highest_pool_idx++];
+ while (ctx) {
+ int len = UPDATE_SIZE;
+ int update_type = HASH_UPDATE;
+ struct user_data *u = (struct user_data *)ctx->user_data;
+ int idx = u->idx;
+
+ if (u->processed == 0)
+ update_type = HASH_FIRST;
+
+ else if (hash_ctx_complete(ctx)) {
+ if (highest_pool_idx < TEST_BUFS)
+ ctx = &ctxpool[highest_pool_idx++];
+ else
+ ctx = md5_ctx_mgr_flush(mgr);
+ continue;
+ } else if (u->processed >= (LEN_TOTAL - UPDATE_SIZE)) {
+ len = (LEN_TOTAL - u->processed);
+ update_type = HASH_LAST;
+ }
+ u->processed += len;
+ ctx = md5_ctx_mgr_submit(mgr, ctx, bufs[idx], len, update_type);
+
+ if (NULL == ctx) {
+ if (highest_pool_idx < TEST_BUFS)
+ ctx = &ctxpool[highest_pool_idx++];
+ else
+ ctx = md5_ctx_mgr_flush(mgr);
+ }
+ }
+
+ printf("multibuffer md5 digest: \n");
+ for (i = 0; i < TEST_BUFS; i++) {
+ printf("Total processing size of buf[%d] is %ld \n", i,
+ ctxpool[i].total_length);
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ printf("digest%d : %08X\n", j, ctxpool[i].job.result_digest[j]);
+ }
+ }
+ printf("\n");
+
+ printf("openssl md5 update digest: \n");
+ for (i = 0; i < MD5_DIGEST_NWORDS; i++)
+ printf("%08X - ", to_le32(((uint32_t *) digest_ref_upd)[i]));
+ printf("\n");
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ref_upd)[j])) {
+ fail++;
+ }
+ }
+ }
+
+ if (fail)
+ printf("Test failed md5 hash large file check %d\n", fail);
+ else
+ printf(" md5_hash_large_test: Pass\n");
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/tests/extended/sha1_mb_over_4GB_test.c b/src/crypto/isa-l/isa-l_crypto/tests/extended/sha1_mb_over_4GB_test.c
new file mode 100644
index 000000000..af94a8098
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tests/extended/sha1_mb_over_4GB_test.c
@@ -0,0 +1,156 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha1_mb.h"
+#include "endian_helper.h"
+#include <openssl/sha.h>
+#define TEST_LEN (1024*1024ull) //1M
+#define TEST_BUFS SHA1_MIN_LANES
+#define ROTATION_TIMES 10000 //total length processing = TEST_LEN * ROTATION_TIMES
+#define UPDATE_SIZE (13*SHA1_BLOCK_SIZE)
+#define LEN_TOTAL (TEST_LEN * ROTATION_TIMES)
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ref_upd[4 * SHA1_DIGEST_NWORDS];
+
+struct user_data {
+ int idx;
+ uint64_t processed;
+};
+
+int main(void)
+{
+ SHA_CTX o_ctx; //openSSL
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, k, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ struct user_data udata[TEST_BUFS];
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha1_ctx_mgr_init(mgr);
+
+ printf("sha1_large_test\n");
+
+ // Init ctx contents
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)&udata[i];
+ }
+
+ //Openssl SHA1 update test
+ SHA1_Init(&o_ctx);
+ for (k = 0; k < ROTATION_TIMES; k++) {
+ SHA1_Update(&o_ctx, bufs[k % TEST_BUFS], TEST_LEN);
+ }
+ SHA1_Final(digest_ref_upd, &o_ctx);
+
+ // Initialize pool
+ for (i = 0; i < TEST_BUFS; i++) {
+ struct user_data *u = (struct user_data *)ctxpool[i].user_data;
+ u->idx = i;
+ u->processed = 0;
+ }
+
+ printf("Starting updates\n");
+ int highest_pool_idx = 0;
+ ctx = &ctxpool[highest_pool_idx++];
+ while (ctx) {
+ int len = UPDATE_SIZE;
+ int update_type = HASH_UPDATE;
+ struct user_data *u = (struct user_data *)ctx->user_data;
+ int idx = u->idx;
+
+ if (u->processed == 0)
+ update_type = HASH_FIRST;
+
+ else if (hash_ctx_complete(ctx)) {
+ if (highest_pool_idx < TEST_BUFS)
+ ctx = &ctxpool[highest_pool_idx++];
+ else
+ ctx = sha1_ctx_mgr_flush(mgr);
+ continue;
+ } else if (u->processed >= (LEN_TOTAL - UPDATE_SIZE)) {
+ len = (LEN_TOTAL - u->processed);
+ update_type = HASH_LAST;
+ }
+ u->processed += len;
+ ctx = sha1_ctx_mgr_submit(mgr, ctx, bufs[idx], len, update_type);
+
+ if (NULL == ctx) {
+ if (highest_pool_idx < TEST_BUFS)
+ ctx = &ctxpool[highest_pool_idx++];
+ else
+ ctx = sha1_ctx_mgr_flush(mgr);
+ }
+ }
+
+ printf("multibuffer SHA1 digest: \n");
+ for (i = 0; i < TEST_BUFS; i++) {
+ printf("Total processing size of buf[%d] is %ld \n", i,
+ ctxpool[i].total_length);
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ printf("digest%d : %08X\n", j, ctxpool[i].job.result_digest[j]);
+ }
+ }
+ printf("\n");
+
+ printf("openssl SHA1 update digest: \n");
+ for (i = 0; i < SHA1_DIGEST_NWORDS; i++)
+ printf("%08X - ", to_be32(((uint32_t *) digest_ref_upd)[i]));
+ printf("\n");
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ref_upd)[j])) {
+ fail++;
+ }
+ }
+ }
+
+ if (fail)
+ printf("Test failed SHA1 hash large file check %d\n", fail);
+ else
+ printf(" SHA1_hash_large_test: Pass\n");
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/tests/extended/sha256_mb_over_4GB_test.c b/src/crypto/isa-l/isa-l_crypto/tests/extended/sha256_mb_over_4GB_test.c
new file mode 100644
index 000000000..35bbdcbae
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tests/extended/sha256_mb_over_4GB_test.c
@@ -0,0 +1,156 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha256_mb.h"
+#include "endian_helper.h"
+#include <openssl/sha.h>
+#define TEST_LEN (1024*1024ull) //1M
+#define TEST_BUFS SHA256_MIN_LANES
+#define ROTATION_TIMES 10000 //total length processing = TEST_LEN * ROTATION_TIMES
+#define UPDATE_SIZE (13*SHA256_BLOCK_SIZE)
+#define LEN_TOTAL (TEST_LEN * ROTATION_TIMES)
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ref_upd[4 * SHA256_DIGEST_NWORDS];
+
+struct user_data {
+ int idx;
+ uint64_t processed;
+};
+
+int main(void)
+{
+ SHA256_CTX o_ctx; //openSSL
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, k, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ struct user_data udata[TEST_BUFS];
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha256_ctx_mgr_init(mgr);
+
+ printf("sha256_large_test\n");
+
+ // Init ctx contents
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)&udata[i];
+ }
+
+ //Openssl SHA256 update test
+ SHA256_Init(&o_ctx);
+ for (k = 0; k < ROTATION_TIMES; k++) {
+ SHA256_Update(&o_ctx, bufs[k % TEST_BUFS], TEST_LEN);
+ }
+ SHA256_Final(digest_ref_upd, &o_ctx);
+
+ // Initialize pool
+ for (i = 0; i < TEST_BUFS; i++) {
+ struct user_data *u = (struct user_data *)ctxpool[i].user_data;
+ u->idx = i;
+ u->processed = 0;
+ }
+
+ printf("Starting updates\n");
+ int highest_pool_idx = 0;
+ ctx = &ctxpool[highest_pool_idx++];
+ while (ctx) {
+ int len = UPDATE_SIZE;
+ int update_type = HASH_UPDATE;
+ struct user_data *u = (struct user_data *)ctx->user_data;
+ int idx = u->idx;
+
+ if (u->processed == 0)
+ update_type = HASH_FIRST;
+
+ else if (hash_ctx_complete(ctx)) {
+ if (highest_pool_idx < TEST_BUFS)
+ ctx = &ctxpool[highest_pool_idx++];
+ else
+ ctx = sha256_ctx_mgr_flush(mgr);
+ continue;
+ } else if (u->processed >= (LEN_TOTAL - UPDATE_SIZE)) {
+ len = (LEN_TOTAL - u->processed);
+ update_type = HASH_LAST;
+ }
+ u->processed += len;
+ ctx = sha256_ctx_mgr_submit(mgr, ctx, bufs[idx], len, update_type);
+
+ if (NULL == ctx) {
+ if (highest_pool_idx < TEST_BUFS)
+ ctx = &ctxpool[highest_pool_idx++];
+ else
+ ctx = sha256_ctx_mgr_flush(mgr);
+ }
+ }
+
+ printf("multibuffer SHA256 digest: \n");
+ for (i = 0; i < TEST_BUFS; i++) {
+ printf("Total processing size of buf[%d] is %ld \n", i,
+ ctxpool[i].total_length);
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ printf("digest%d : %08X\n", j, ctxpool[i].job.result_digest[j]);
+ }
+ }
+ printf("\n");
+
+ printf("openssl SHA256 update digest: \n");
+ for (i = 0; i < SHA256_DIGEST_NWORDS; i++)
+ printf("%08X - ", to_be32(((uint32_t *) digest_ref_upd)[i]));
+ printf("\n");
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ref_upd)[j])) {
+ fail++;
+ }
+ }
+ }
+
+ if (fail)
+ printf("Test failed SHA256 hash large file check %d\n", fail);
+ else
+ printf(" SHA256_hash_large_test: Pass\n");
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/tests/extended/sha512_mb_over_4GB_test.c b/src/crypto/isa-l/isa-l_crypto/tests/extended/sha512_mb_over_4GB_test.c
new file mode 100644
index 000000000..9c2aeaead
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tests/extended/sha512_mb_over_4GB_test.c
@@ -0,0 +1,156 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha512_mb.h"
+#include "endian_helper.h"
+#include <openssl/sha.h>
+#define TEST_LEN (1024*1024ull) //1M
+#define TEST_BUFS SHA512_MIN_LANES
+#define ROTATION_TIMES 10000 //total length processing = TEST_LEN * ROTATION_TIMES
+#define UPDATE_SIZE (13*SHA512_BLOCK_SIZE)
+#define LEN_TOTAL (TEST_LEN * ROTATION_TIMES)
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ref_upd[8 * SHA512_DIGEST_NWORDS];
+
+struct user_data {
+ int idx;
+ uint64_t processed;
+};
+
+int main(void)
+{
+ SHA512_CTX o_ctx; //openSSL
+ SHA512_HASH_CTX_MGR *mgr = NULL;
+ SHA512_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, k, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ struct user_data udata[TEST_BUFS];
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha512_ctx_mgr_init(mgr);
+
+ printf("sha512_large_test\n");
+
+ // Init ctx contents
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)&udata[i];
+ }
+
+ //Openssl SHA512 update test
+ SHA512_Init(&o_ctx);
+ for (k = 0; k < ROTATION_TIMES; k++) {
+ SHA512_Update(&o_ctx, bufs[k % TEST_BUFS], TEST_LEN);
+ }
+ SHA512_Final(digest_ref_upd, &o_ctx);
+
+ // Initialize pool
+ for (i = 0; i < TEST_BUFS; i++) {
+ struct user_data *u = (struct user_data *)ctxpool[i].user_data;
+ u->idx = i;
+ u->processed = 0;
+ }
+
+ printf("Starting updates\n");
+ int highest_pool_idx = 0;
+ ctx = &ctxpool[highest_pool_idx++];
+ while (ctx) {
+ int len = UPDATE_SIZE;
+ int update_type = HASH_UPDATE;
+ struct user_data *u = (struct user_data *)ctx->user_data;
+ int idx = u->idx;
+
+ if (u->processed == 0)
+ update_type = HASH_FIRST;
+
+ else if (hash_ctx_complete(ctx)) {
+ if (highest_pool_idx < TEST_BUFS)
+ ctx = &ctxpool[highest_pool_idx++];
+ else
+ ctx = sha512_ctx_mgr_flush(mgr);
+ continue;
+ } else if (u->processed >= (LEN_TOTAL - UPDATE_SIZE)) {
+ len = (LEN_TOTAL - u->processed);
+ update_type = HASH_LAST;
+ }
+ u->processed += len;
+ ctx = sha512_ctx_mgr_submit(mgr, ctx, bufs[idx], len, update_type);
+
+ if (NULL == ctx) {
+ if (highest_pool_idx < TEST_BUFS)
+ ctx = &ctxpool[highest_pool_idx++];
+ else
+ ctx = sha512_ctx_mgr_flush(mgr);
+ }
+ }
+
+ printf("multibuffer sha512 digest: \n");
+ for (i = 0; i < TEST_BUFS; i++) {
+ printf("Total processing size of buf[%d] is %ld \n", i,
+ ctxpool[i].total_length);
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ printf("digest%d : %016lX\n", j, ctxpool[i].job.result_digest[j]);
+ }
+ }
+ printf("\n");
+
+ printf("openssl sha512 update digest: \n");
+ for (i = 0; i < SHA512_DIGEST_NWORDS; i++)
+ printf("%016lX - ", to_be64(((uint64_t *) digest_ref_upd)[i]));
+ printf("\n");
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be64(((uint64_t *) digest_ref_upd)[j])) {
+ fail++;
+ }
+ }
+ }
+
+ if (fail)
+ printf("Test failed sha512 hash large file check %d\n", fail);
+ else
+ printf(" sha512_hash_large_test: Pass\n");
+ return fail;
+}
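
For reference, the submit/flush pattern that the loop above drives across many contexts reduces, for a single context, to the sketch below. It is an illustration only, not part of the patch; the three-block buffer and the submit_and_wait() helper are invented for the example, while the sha512_mb calls are the same ones used in the test.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "sha512_mb.h"

    /* Flush until the manager hands the submitted context back. */
    static SHA512_HASH_CTX *submit_and_wait(SHA512_HASH_CTX_MGR *mgr, SHA512_HASH_CTX *ctx,
                                            unsigned char *buf, uint32_t len, int flags)
    {
            SHA512_HASH_CTX *c = sha512_ctx_mgr_submit(mgr, ctx, buf, len, flags);
            while (c == NULL)
                    c = sha512_ctx_mgr_flush(mgr);
            return c;
    }

    int main(void)
    {
            SHA512_HASH_CTX_MGR *mgr = NULL;
            SHA512_HASH_CTX ctx;
            unsigned char buf[3 * SHA512_BLOCK_SIZE];
            uint32_t i;

            memset(buf, 0xab, sizeof(buf));
            if (posix_memalign((void **)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR)) != 0)
                    return 1;
            sha512_ctx_mgr_init(mgr);
            hash_ctx_init(&ctx);

            // HASH_FIRST starts the job, HASH_UPDATE extends it, HASH_LAST pads and finalizes
            submit_and_wait(mgr, &ctx, buf, SHA512_BLOCK_SIZE, HASH_FIRST);
            submit_and_wait(mgr, &ctx, buf + SHA512_BLOCK_SIZE, SHA512_BLOCK_SIZE, HASH_UPDATE);
            submit_and_wait(mgr, &ctx, buf + 2 * SHA512_BLOCK_SIZE, SHA512_BLOCK_SIZE, HASH_LAST);

            if (!hash_ctx_complete(&ctx))
                    return 1;
            for (i = 0; i < SHA512_DIGEST_NWORDS; i++)
                    printf("digest%d : %016lX\n", i, ctx.job.result_digest[i]);
            free(mgr);
            return 0;
    }
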
diff --git a/src/crypto/isa-l/isa-l_crypto/tests/extended/sm3_mb_over_4GB_test.c b/src/crypto/isa-l/isa-l_crypto/tests/extended/sm3_mb_over_4GB_test.c
new file mode 100644
index 000000000..ea98e29b5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tests/extended/sm3_mb_over_4GB_test.c
@@ -0,0 +1,162 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sm3_mb.h"
+#include "endian_helper.h"
+#include <openssl/evp.h>
+
+#define TEST_LEN (1024*1024ull) //1M
+#define TEST_BUFS SM3_MAX_LANES
+#define ROTATION_TIMES 10000 //total length processing = TEST_LEN * ROTATION_TIMES
+#define UPDATE_SIZE (13*SM3_BLOCK_SIZE)
+#define LEN_TOTAL (TEST_LEN * ROTATION_TIMES)
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ref_upd[4 * SM3_DIGEST_NWORDS];
+
+struct user_data {
+ int idx;
+ uint64_t processed;
+};
+
+int main(void)
+{
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, k, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ struct user_data udata[TEST_BUFS];
+ EVP_MD_CTX *md_ctx;
+ const EVP_MD *md;
+ unsigned int md_len;
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sm3_ctx_mgr_init(mgr);
+
+ printf("sm3_large_test\n");
+
+ // Init ctx contents
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)&udata[i];
+ }
+
+ //Openssl SM3 update test
+ md = EVP_sm3();
+ md_ctx = EVP_MD_CTX_new();
+ EVP_DigestInit_ex(md_ctx, md, NULL);
+ for (k = 0; k < ROTATION_TIMES; k++) {
+ EVP_DigestUpdate(md_ctx, bufs[k % TEST_BUFS], TEST_LEN);
+ }
+ EVP_DigestFinal_ex(md_ctx, digest_ref_upd, &md_len);
+ EVP_MD_CTX_free(md_ctx);
+
+ // Initialize pool
+ for (i = 0; i < TEST_BUFS; i++) {
+ struct user_data *u = (struct user_data *)ctxpool[i].user_data;
+ u->idx = i;
+ u->processed = 0;
+ }
+
+ printf("Starting updates\n");
+ int highest_pool_idx = 0;
+ ctx = &ctxpool[highest_pool_idx++];
+ while (ctx) {
+ int len = UPDATE_SIZE;
+ int update_type = HASH_UPDATE;
+ struct user_data *u = (struct user_data *)ctx->user_data;
+ int idx = u->idx;
+
+ if (u->processed == 0)
+ update_type = HASH_FIRST;
+
+ else if (hash_ctx_complete(ctx)) {
+ if (highest_pool_idx < TEST_BUFS)
+ ctx = &ctxpool[highest_pool_idx++];
+ else
+ ctx = sm3_ctx_mgr_flush(mgr);
+ continue;
+ } else if (u->processed >= (LEN_TOTAL - UPDATE_SIZE)) {
+ len = (LEN_TOTAL - u->processed);
+ update_type = HASH_LAST;
+ }
+ u->processed += len;
+ ctx = sm3_ctx_mgr_submit(mgr, ctx, bufs[idx], len, update_type);
+
+ if (NULL == ctx) {
+ if (highest_pool_idx < TEST_BUFS)
+ ctx = &ctxpool[highest_pool_idx++];
+ else
+ ctx = sm3_ctx_mgr_flush(mgr);
+ }
+ }
+
+ printf("multibuffer SM3 digest: \n");
+ for (i = 0; i < TEST_BUFS; i++) {
+ printf("Total processing size of buf[%d] is %ld \n", i,
+ ctxpool[i].total_length);
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ printf("digest%d : %08X\n", j, ctxpool[i].job.result_digest[j]);
+ }
+ }
+ printf("\n");
+
+ printf("openssl SM3 update digest: \n");
+ for (i = 0; i < SM3_DIGEST_NWORDS; i++)
+ printf("%08X - ", to_le32(((uint32_t *) digest_ref_upd)[i]));
+ printf("\n");
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ref_upd)[j])) {
+ fail++;
+ }
+ }
+ }
+
+ if (fail)
+ printf("Test failed SM3_hash_large check %d\n", fail);
+ else
+ printf(" SM3_hash_large_test: Pass\n");
+ return fail;
+}
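
The test above streams data through HASH_FIRST/HASH_UPDATE/HASH_LAST; for a buffer that is available in one piece, the same sm3_mb manager can hash it in a single submit. A minimal sketch (illustration only; it assumes the HASH_ENTIRE flag from the common hash-ctx API, which this extended test does not use, and the 1 KiB message is made up):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "sm3_mb.h"

    int main(void)
    {
            SM3_HASH_CTX_MGR *mgr = NULL;
            SM3_HASH_CTX ctx, *c;
            unsigned char msg[1024];
            uint32_t i;

            memset(msg, 0x5a, sizeof(msg));
            if (posix_memalign((void **)&mgr, 16, sizeof(SM3_HASH_CTX_MGR)) != 0)
                    return 1;
            sm3_ctx_mgr_init(mgr);
            hash_ctx_init(&ctx);

            // One-shot submit: HASH_ENTIRE covers first and last block in one call
            c = sm3_ctx_mgr_submit(mgr, &ctx, msg, sizeof(msg), HASH_ENTIRE);
            while (c == NULL)
                    c = sm3_ctx_mgr_flush(mgr);

            if (!hash_ctx_complete(&ctx))
                    return 1;
            for (i = 0; i < SM3_DIGEST_NWORDS; i++)
                    printf("digest%d : %08X\n", i, ctx.job.result_digest[i]);
            free(mgr);
            return 0;
    }
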
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/check_format.sh b/src/crypto/isa-l/isa-l_crypto/tools/check_format.sh
new file mode 100755
index 000000000..8c67a931c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/check_format.sh
@@ -0,0 +1,87 @@
+#!/usr/bin/env bash
+
+set -e
+rc=0
+verbose=0
+indent_args='-linux -l95 -cp1 -lps -il6 -ncs'
+function iver { printf "%03d%03d%03d%03d" $(echo "$@" | sed 's/GNU indent//' | tr '.' ' '); }
+
+while [ -n "$*" ]; do
+ case "$1" in
+ -v )
+ verbose=1
+ shift
+ ;;
+ -h )
+ echo check_format.sh [-h -v]
+ exit 0
+ ;;
+ esac
+done
+
+echo "Checking format of files in the git index at $PWD"
+if ! git rev-parse --is-inside-work-tree >& /dev/null; then
+ echo "Not in a git repo: Fail"
+ exit 1
+fi
+
+if hash indent && [ $(iver $(indent --version)) -ge $(iver 2.2.12) ]; then
+ echo "Checking C files for coding style..."
+ for f in `git ls-files '*.c'`; do
+ [ "$verbose" -gt 0 ] && echo "checking style on $f"
+ if ! indent $indent_args -st $f | diff -q $f - >& /dev/null; then
+ echo " File found with formatting issues: $f"
+ [ "$verbose" -gt 0 ] 2> /dev/null && indent $indent_args -st $f | diff -u $f -
+ rc=1
+ fi
+ done
+ [ "$rc" -gt 0 ] && echo " Run ./tools/iindent on files"
+else
+ echo "You do not have a recent indent installed so your code style is not being checked!"
+fi
+
+if hash grep; then
+ echo "Checking for dos and whitespace violations..."
+ for f in $(git ls-files); do
+ [ "$verbose" -gt 0 ] && echo "checking whitespace on $f"
+ if grep -q '[[:space:]]$' $f ; then
+ echo " File found with trailing whitespace: $f"
+ rc=1
+ fi
+ if grep -q $'\r' $f ; then
+ echo " File found with dos formatting: $f"
+ rc=1
+ fi
+ done
+fi
+
+echo "Checking source files for permissions..."
+while read -r perm _res0 _res1 f; do
+ [ -z "$f" ] && continue
+ [ "$verbose" -gt 0 ] && echo "checking permissions on $f"
+ if [ "$perm" -ne 100644 ]; then
+ echo " File found with permissions issue ($perm): $f"
+ rc=1
+ fi
+done <<< $(git ls-files -s -- ':(exclude)*.sh' ':(exclude)*iindent')
+
+echo "Checking script files for permissions..."
+while read -r perm _res0 _res1 f; do
+ [ -z "$f" ] && continue
+ [ "$verbose" -gt 0 ] && echo "checking permissions on $f"
+ if [ "$perm" -ne 100755 ]; then
+ echo " Script found with permissions issue ($perm): $f"
+ rc=1
+ fi
+done <<< $(git ls-files -s '*.sh')
+
+
+echo "Checking for signoff in commit message..."
+if ! git log -n 1 --format=%B | grep -q "^Signed-off-by:" ; then
+ echo " Commit not signed off. Please read src/CONTRIBUTING.md"
+ rc=1
+fi
+
+[ "$rc" -gt 0 ] && echo Format Fail || echo Format Pass
+
+exit $rc
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/gen_nmake.mk b/src/crypto/isa-l/isa-l_crypto/tools/gen_nmake.mk
new file mode 100644
index 000000000..f2c8b46ed
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/gen_nmake.mk
@@ -0,0 +1,123 @@
+# Regenerate nmake file from makefiles or check its consistency
+
+test_nmake_file: tst.nmake
+ @diff -u Makefile.nmake tst.nmake || (echo Potential nmake consistency issue; $(RM) tst.nmake; false;)
+ @echo No nmake consistency issues
+ @$(RM) tst.nmake
+
+FORCE:
+Makefile.nmake tst.nmake: FORCE
+ @echo Regenerating $@
+ @echo '########################################################################' > $@
+ @cat LICENSE | sed -e 's/^/#/ ' >> $@
+ @echo '########################################################################' >> $@
+ @echo '' >> $@
+ @echo '# This file can be auto-regenerated with $$make -f Makefile.unx Makefile.nmake' >> $@
+ @echo '' >> $@
+ @echo -n 'objs =' >> $@
+ @$(foreach o, $(subst /,\\,$(objs:.o=.obj)), printf " %s\n\t%s" \\ $(o) >> $@; )
+ @echo '' >> $@
+ @echo '' >> $@
+ @echo 'INCLUDES = $(INCLUDE)' >> $@
+ @echo '# Modern asm feature level, consider upgrading nasm/yasm before decreasing feature_level' >> $@
+ @echo 'FEAT_FLAGS = -DHAVE_AS_KNOWS_AVX512 -DAS_FEATURE_LEVEL=10 -DHAVE_AS_KNOWS_SHANI' >> $@
+ @echo 'CFLAGS_REL = -O2 -DNDEBUG /Z7 /MD /Gy' >> $@
+ @echo 'CFLAGS_DBG = -Od -DDEBUG /Z7 /MDd' >> $@
+ @echo 'LINKFLAGS = -nologo -incremental:no -debug' >> $@
+ @echo 'CFLAGS = $$(CFLAGS_REL) -nologo -D_USE_MATH_DEFINES $$(FEAT_FLAGS) $$(INCLUDES) $$(D)' >> $@
+ @echo 'AFLAGS = -f win64 $$(FEAT_FLAGS) $$(INCLUDES) $$(D)' >> $@
+ @echo 'CC = cl' >> $@
+ @echo '# or CC = icl -Qstd=c99' >> $@
+ @echo 'AS = nasm' >> $@
+ @echo '' >> $@
+ @echo 'lib: bin static dll' >> $@
+ @echo 'static: bin isa-l_crypto_static.lib' >> $@
+ @echo 'dll: bin isa-l_crypto.dll' >> $@
+ @echo '' >> $@
+ @echo 'bin: ; -mkdir $$@' >> $@
+ @echo '' >> $@
+ @echo 'isa-l_crypto_static.lib: $$(objs)' >> $@
+ @echo ' lib -out:$$@ @<<' >> $@
+ @echo '$$?' >> $@
+ @echo '<<' >> $@
+ @echo '' >> $@
+ @echo 'isa-l_crypto.dll: $$(objs)' >> $@
+ @echo ' link -out:$$@ -dll -def:isa-l_crypto.def $$(LINKFLAGS) @<<' >> $@
+ @echo '$$?' >> $@
+ @echo '<<' >> $@
+ @echo '' >> $@
+ @$(foreach b, $(units), \
+ printf "{%s}.c.obj:\n\t\$$(CC) \$$(CFLAGS) /c -Fo\$$@ \$$?\n{%s}.asm.obj:\n\t\$$(AS) \$$(AFLAGS) -o \$$@ \$$?\n\n" $(b) $(b) >> $@; )
+ @echo '' >> $@
+ifneq (,$(examples))
+ @echo "# Examples" >> $@
+ @echo -n 'ex =' >> $@
+ @$(foreach ex, $(notdir $(examples)), printf " %s\n\t%s.exe" \\ $(ex) >> $@; )
+ @echo '' >> $@
+ @echo '' >> $@
+ @echo 'ex: lib $$(ex)' >> $@
+ @echo '' >> $@
+ @echo '$$(ex): $$(@B).obj' >> $@
+endif
+ @echo '' >> $@
+ @echo '.obj.exe:' >> $@
+ @echo ' link /out:$$@ $$(LINKFLAGS) isa-l_crypto.lib $$?' >> $@
+ @echo '' >> $@
+ @echo '# Check tests' >> $@
+ @echo -n 'checks =' >> $@
+ @$(foreach check, $(notdir $(check_tests)), printf " %s\n\t%s.exe" \\ $(check) >> $@; )
+ @echo '' >> $@
+ @echo '' >> $@
+ @echo 'checks: lib $$(checks)' >> $@
+ @echo '$$(checks): $$(@B).obj' >> $@
+ @echo 'check: $$(checks)' >> $@
+ @echo ' !$$?' >> $@
+ @echo '' >> $@
+ @echo '# Unit tests' >> $@
+ @echo -n 'tests =' >> $@
+ @$(foreach test, $(notdir $(unit_tests)), printf " %s\n\t%s.exe" \\ $(test) >> $@; )
+ @echo '' >> $@
+ @echo '' >> $@
+ @echo 'tests: lib $$(tests)' >> $@
+ @echo '$$(tests): $$(@B).obj' >> $@
+ @echo '' >> $@
+ @echo '# Performance tests' >> $@
+ @echo -n 'perfs =' >> $@
+ @$(foreach perf, $(notdir $(perf_tests)), printf " %s\n\t%s.exe" \\ $(perf) >> $@; )
+ @echo '' >> $@
+ @echo '' >> $@
+ @echo 'perfs: lib $$(perfs)' >> $@
+ @echo '$$(perfs): $$(@B).obj' >> $@
+ @echo '' >> $@
+ @echo -n 'progs =' >> $@
+ @$(foreach prog, $(notdir $(bin_PROGRAMS)), printf " %s\n\t%s.exe" \\ $(prog) >> $@; )
+ @echo '' >> $@
+ @echo '' >> $@
+ @echo 'progs: lib $$(progs)' >> $@
+ @$(foreach p, $(notdir $(bin_PROGRAMS)), \
+ printf "%s.exe: %s\n\tlink /out:\$$@ \$$(LINKFLAGS) isa-l_crypto.lib \$$?\n" $(p) $(subst /,\\,$(programs_$(p)_SOURCES:.c=.obj)) >> $@; )
+ @echo '' >> $@
+ @echo 'clean:' >> $@
+ @echo ' -if exist *.obj del *.obj' >> $@
+ @echo ' -if exist bin\*.obj del bin\*.obj' >> $@
+ @echo ' -if exist isa-l_crypto_static.lib del isa-l_crypto_static.lib' >> $@
+ @echo ' -if exist *.exe del *.exe' >> $@
+ @echo ' -if exist *.pdb del *.pdb' >> $@
+ @echo ' -if exist isa-l_crypto.lib del isa-l_crypto.lib' >> $@
+ @echo ' -if exist isa-l_crypto.dll del isa-l_crypto.dll' >> $@
+ @echo ' -if exist isa-l_crypto.exp del isa-l_crypto.exp' >> $@
+ @echo '' >> $@
+ @echo 'libcrypto.lib:' >> $@
+ @cat $(foreach unit,$(units), $(unit)/Makefile.am) | sed \
+ -e '/: /!d' \
+ -e 's/\([^ :]*\)[ ]*/\1.exe /g' \
+ -e :c -e 's/:\(.*\).exe/:\1/;tc' \
+ -e 's/\.o[ $$]/.obj /g' \
+ -e 's/\.o\.exe[ ]:/.obj:/g' \
+ -e '/CFLAGS_.*+=/d' \
+ -e '/:.*\%.*:/d' \
+ -e 's/ :/:/' \
+ -e 's/LDLIBS *+=//' \
+ -e 's/-lcrypto/libcrypto.lib/' \
+ -e 's/ $$//' \
+ >> $@
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/iindent b/src/crypto/isa-l/isa-l_crypto/tools/iindent
new file mode 100755
index 000000000..48d26360f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/iindent
@@ -0,0 +1,2 @@
+#!/bin/sh
+indent -linux -l95 -cp1 -lps -il6 -ncs "$@"
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/nasm-cet-filter.sh b/src/crypto/isa-l/isa-l_crypto/tools/nasm-cet-filter.sh
new file mode 100755
index 000000000..19e03856c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/nasm-cet-filter.sh
@@ -0,0 +1,56 @@
+#!/bin/sh
+
+# Filter out unnecessary options added by automake
+
+while [ -n "$*" ]; do
+ case "$1" in
+ -o )
+ # Supported options with arg
+ options="$options $1 $2"
+ shift
+ object="$1"
+ shift
+ ;;
+ -f | -D )
+ # Supported options with arg
+ options="$options $1 $2"
+ shift
+ shift
+ ;;
+ -I | -i )
+ options="$options $1 $2/"
+ shift
+ shift
+ ;;
+ --prefix* )
+ # Supported options without arg
+ options="$options $1"
+ shift
+ ;;
+ -I* | -i* )
+ options="$options $1/"
+ shift
+ ;;
+ -D* ) # For defines we need to remove spaces
+ case "$1" in
+ *' '* ) ;;
+ *) options="$options $1" ;;
+ esac
+ shift
+ ;;
+ #-blah )
+ # Unsupported options with args - none known
+ -* )
+ # Unsupported options with no args
+ shift
+ ;;
+ * )
+ args="$args $1"
+ shift
+ ;;
+ esac
+done
+
+nasm $options $args
+$CET_LD -r -z ibt -z shstk -o $object.tmp $object
+mv $object.tmp $object
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/nasm-filter.sh b/src/crypto/isa-l/isa-l_crypto/tools/nasm-filter.sh
new file mode 100755
index 000000000..5ec9ba3f3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/nasm-filter.sh
@@ -0,0 +1,47 @@
+#!/bin/sh
+
+# Filter out unnecessary options added by automake
+
+while [ -n "$*" ]; do
+ case "$1" in
+ -f | -o | -D )
+ # Supported options with arg
+ options="$options $1 $2"
+ shift
+ shift
+ ;;
+ -I | -i )
+ options="$options $1 $2/"
+ shift
+ shift
+ ;;
+ --prefix* )
+ # Supported options without arg
+ options="$options $1"
+ shift
+ ;;
+ -I* | -i* )
+ options="$options $1/"
+ shift
+ ;;
+ -D* ) # For defines we need to remove spaces
+ case "$1" in
+ *' '* ) ;;
+ *) options="$options $1" ;;
+ esac
+ shift
+ ;;
+ #-blah )
+ # Unsupported options with args - none known
+ -* )
+ # Unsupported options with no args
+ shift
+ ;;
+ * )
+ args="$args $1"
+ shift
+ ;;
+ esac
+done
+
+nasm $options $args
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/remove_trailing_whitespace.sh b/src/crypto/isa-l/isa-l_crypto/tools/remove_trailing_whitespace.sh
new file mode 100755
index 000000000..bb82b9fa5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/remove_trailing_whitespace.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+sed -i.bak 's/[[:blank:]]*$//' "$@"
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/test_autorun.sh b/src/crypto/isa-l/isa-l_crypto/tools/test_autorun.sh
new file mode 100755
index 000000000..756e1e069
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/test_autorun.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+
+set -e #exit on fail
+
+# Override defaults if exist
+READLINK=readlink
+command -V greadlink >/dev/null 2>&1 && READLINK=greadlink
+
+
+out="$PWD"
+src=$($READLINK -f $(dirname $0))/..
+cd "$src"
+
+# Echo environment info
+if test -d .git; then
+ branch=$(git describe)
+ commitid=$(git rev-parse HEAD)
+ brief=$(git log -1 --format='%s')
+ branch_changes=$(git diff --shortstat)
+fi
+if command -V uname >/dev/null 2>&1; then
+ node=$(uname -n)
+ os_name=$(uname -s)
+ os_all=$(uname -a)
+fi
+
+echo "Test report v1"
+echo "branch: $branch"
+echo "brief: $brief"
+echo "commitid: $commitid"
+echo "node: $node"
+echo "os_name: $os_name"
+echo "os_all: $os_all"
+echo "test_args: $@"
+echo "changes: $branch_changes"
+command -V lscpu > /dev/null 2>&1 && lscpu
+
+# Start tests
+
+[ -z "$1" ] && ./tools/test_checks.sh
+
+while [ -n "$1" ]; do
+ case "$1" in
+ check )
+ ./tools/test_checks.sh
+ shift ;;
+ ext )
+ ./tools/test_extended.sh
+ shift ;;
+ format )
+ shift ;;
+ all )
+ ./tools/test_checks.sh
+ ./tools/test_extended.sh
+ shift ;;
+ * )
+ echo $0 undefined option: $1
+ shift ;;
+ esac
+done
+
+./tools/check_format.sh
+
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/test_checks.sh b/src/crypto/isa-l/isa-l_crypto/tools/test_checks.sh
new file mode 100755
index 000000000..9573554db
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/test_checks.sh
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+
+set -xe #exit on fail
+
+# Defaults
+cpus=1
+S=$RANDOM
+MAKE=make
+READLINK=readlink
+
+# Override defaults if exist
+command -V gmake >/dev/null 2>&1 && MAKE=gmake
+command -V greadlink >/dev/null 2>&1 && READLINK=greadlink
+
+out="$PWD"
+src=$($READLINK -f $(dirname $0))/..
+source $src/tools/test_tools.sh
+cd "$src"
+tmp_install_dir=$out/tmp_install
+
+# Run on mult cpus
+if command -V lscpu >/dev/null 2>&1; then
+ cpus=`lscpu -p | tail -1 | cut -d, -f 2`
+ cpus=$(($cpus + 1))
+elif command -V sysctl; then
+ if sysctl -n hw.ncpu >/dev/null 2>&1; then
+ cpus=$(sysctl -n hw.ncpu)
+ cpus=$(($cpus + 1))
+ fi
+fi
+echo "Using $cpus cpu threads"
+
+# Pick a random test seed
+if [ -z "$S" ]; then
+ S=`tr -cd 0-9 </dev/urandom | head -c 4 | sed -e 's/^0*/1/g'`
+ [ "$S" -gt 0 ] 2> /dev/null || S="123"
+fi
+echo "Running with TEST_SEED=$S"
+
+# Fix Darwin issues
+if uname | grep -q 'Darwin' 2>&1; then
+ export SED=`which sed`
+ opt_config_target='--target=darwin'
+fi
+
+# Tests
+time ./autogen.sh
+time ./configure --prefix=$tmp_install_dir $opt_config_target
+time $MAKE -j $cpus
+test_start "check_tests"
+time $MAKE check -j $cpus D="-D TEST_SEED=$S"
+test_end "check_tests" $?
+test_start "installation_test"
+time $MAKE install
+test_end "installation_test" $?
+
+# Check for gnu executable stack set
+if command -V readelf >/dev/null 2>&1; then
+ if readelf -W -l $tmp_install_dir/lib/libisal_crypto.so | grep 'GNU_STACK' | grep -q 'RWE'; then
+ echo Stack NX check $tmp_install_dir/lib/libisal_crypto.so Fail
+ exit 1
+ else
+ echo Stack NX check $tmp_install_dir/lib/libisal_crypto.so Pass
+ fi
+else
+ echo Stack NX check not supported
+fi
+
+$MAKE clean
+
+
+
+echo $0: Pass
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/test_extended.sh b/src/crypto/isa-l/isa-l_crypto/tools/test_extended.sh
new file mode 100755
index 000000000..b79cbb0c1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/test_extended.sh
@@ -0,0 +1,127 @@
+#!/usr/bin/env bash
+
+# Extended tests: Run a few more options other than make check
+
+set -xe #exit on fail
+
+# Defaults
+cpus=1
+S=$RANDOM
+MAKE=make
+READLINK=readlink
+test_level=check
+build_opt=''
+msg=''
+
+# Override defaults if exist
+command -V gmake >/dev/null 2>&1 && MAKE=gmake
+command -V greadlink >/dev/null 2>&1 && READLINK=greadlink
+[ -n "$CC" ] && build_opt+="CC=$CC "
+[ -n "$AS" ] && build_opt+="AS=$AS "
+
+out="$PWD"
+src=$($READLINK -f $(dirname $0))/..
+source $src/tools/test_tools.sh
+cd "$src"
+
+# Run on mult cpus
+if command -V lscpu >/dev/null 2>&1; then
+ cpus=`lscpu -p | tail -1 | cut -d, -f 2`
+ cpus=$(($cpus + 1))
+elif command -V sysctl; then
+ if sysctl -n hw.ncpu >/dev/null 2>&1; then
+ cpus=$(sysctl -n hw.ncpu)
+ cpus=$(($cpus + 1))
+ fi
+fi
+echo "Using $cpus cpu threads"
+
+if [ -z "$S" ]; then
+ S=`tr -cd 0-9 </dev/urandom | head -c 4 | sed -e 's/^0*/1/g'`
+ [ "$S" -gt 0 ] 2> /dev/null || S="123"
+fi
+msg+="Running with TEST_SEED=$S".$'\n'
+
+# Fix Darwin issues
+if uname | grep -q 'Darwin' 2>&1; then
+ export SED=`which sed`
+fi
+
+# Check for test libs to add
+if command -V ldconfig >/dev/null 2>&1; then
+ if ldconfig -p | grep -q libcrypto.so; then
+ test_level=test
+ msg+=$'With extra tests\n'
+ fi
+ if ldconfig -p | grep -q libefence.so; then
+ build_opt+="LDFLAGS+='-lefence' "
+ msg+=$'With efence\n'
+ fi
+fi
+
+# Std makefile build test
+$MAKE -f Makefile.unx clean
+test_start "extended_build_test"
+time $MAKE -f Makefile.unx -j $cpus $build_opt
+test_end "extended_build_test" $?
+msg+=$'Std makefile build: Pass\n'
+
+# Check for gnu executable stack set
+if command -V readelf >/dev/null 2>&1; then
+ test_start "stack_nx_check"
+ if readelf -W -l bin/libisal_crypto.so | grep 'GNU_STACK' | grep -q 'RWE'; then
+ echo $0: Stack NX check bin/libisal_crypto.so: Fail
+ test_end "stack_nx_check" 1
+ exit 1
+ else
+ test_end "stack_nx_check" 0
+        msg+=$'Stack NX check bin/libisal_crypto.so: Pass\n'
+ fi
+else
+ msg+=$'Stack NX check not supported: Skip\n'
+fi
+
+# Std makefile build perf tests
+test_start "extended_perf_test"
+time $MAKE -f Makefile.unx -j $cpus perfs
+test_end "extended_perf_test" $?
+msg+=$'Std makefile build perf: Pass\n'
+
+# Std makefile run tests
+test_start "extended_makefile_tests"
+time $MAKE -f Makefile.unx -j $cpus $build_opt D="TEST_SEED=$S" $test_level
+test_end "extended_makefile_tests" $?
+msg+=$'Std makefile tests: Pass\n'
+
+# Std makefile build other
+test_start "extended_other_tests"
+time $MAKE -f Makefile.unx -j $cpus $build_opt D="TEST_SEED=$S" other
+test_end "extended_other_tests" $?
+msg+=$'Other tests build: Pass\n'
+
+$MAKE -f Makefile.unx clean
+
+# Std makefile run tests with NT_LDST
+test_start "extended_makefile_tests with NT_LDST"
+$MAKE -f Makefile.unx -j $cpus $build_opt D="TEST_SEED=$S NT_LDST"
+time $MAKE -f Makefile.unx -j $cpus $build_opt D="TEST_SEED=$S NT_LDST" $test_level
+test_end "extended_makefile_tests with NT_LDST" $?
+msg+=$'Std makefile tests with NT_LDST: Pass\n'
+
+$MAKE -f Makefile.unx clean
+
+# noarch makefile run tests
+test_start "extended_makefile_tests"
+time $MAKE -f Makefile.unx -j $cpus $build_opt D="TEST_SEED=$S" \
+ arch=noarch
+time $MAKE -f Makefile.unx -j $cpus $build_opt D="TEST_SEED=$S" \
+ arch=noarch $test_level
+test_end "extended_makefile_tests" $?
+msg+=$'noarch makefile tests: Pass\n'
+
+set +x
+echo
+echo "Summary test $0:"
+echo "Build opt: $build_opt"
+echo "$msg"
+echo "$0: Final: Pass"
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/test_tools.sh b/src/crypto/isa-l/isa-l_crypto/tools/test_tools.sh
new file mode 100755
index 000000000..448b1f92b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/test_tools.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+function test_start()
+{
+ echo "entering test: $1"
+}
+
+function test_end()
+{
+ echo "leaving test: $1 status: $2"
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/yasm-cet-filter.sh b/src/crypto/isa-l/isa-l_crypto/tools/yasm-cet-filter.sh
new file mode 100755
index 000000000..d7b3e973d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/yasm-cet-filter.sh
@@ -0,0 +1,47 @@
+#!/bin/sh
+
+# Filter out unnecessary options added by automake
+
+while [ -n "$*" ]; do
+ case "$1" in
+ -o )
+ # Supported options with arg
+ options="$options $1 $2"
+ shift
+ object="$1"
+ shift
+ ;;
+ -f | -I | -i | -D )
+ # Supported options with arg
+ options="$options $1 $2"
+ shift
+ shift
+ ;;
+ -I* | -i* | --prefix* )
+ # Supported options without arg
+ options="$options $1"
+ shift
+ ;;
+ -D* ) # For defines we need to remove spaces
+ case "$1" in
+ *' '* ) ;;
+ *) options="$options $1" ;;
+ esac
+ shift
+ ;;
+ #-blah )
+ # Unsupported options with args - none known
+ -* )
+ # Unsupported options with no args
+ shift
+ ;;
+ * )
+ args="$args $1"
+ shift
+ ;;
+ esac
+done
+
+yasm $options $args
+$CET_LD -r -z ibt -z shstk -o $object.tmp $object
+mv $object.tmp $object
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/yasm-filter.sh b/src/crypto/isa-l/isa-l_crypto/tools/yasm-filter.sh
new file mode 100755
index 000000000..c33952a40
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/yasm-filter.sh
@@ -0,0 +1,38 @@
+#!/bin/sh
+
+# Filter out unnecessary options added by automake
+
+while [ -n "$*" ]; do
+ case "$1" in
+ -f | -o | -I | -i | -D )
+ # Supported options with arg
+ options="$options $1 $2"
+ shift
+ shift
+ ;;
+ -I* | -i* | --prefix* )
+ # Supported options without arg
+ options="$options $1"
+ shift
+ ;;
+ -D* ) # For defines we need to remove spaces
+ case "$1" in
+ *' '* ) ;;
+ *) options="$options $1" ;;
+ esac
+ shift
+ ;;
+ #-blah )
+ # Unsupported options with args - none known
+ -* )
+ # Unsupported options with no args
+ shift
+ ;;
+ * )
+ args="$args $1"
+ shift
+ ;;
+ esac
+done
+
+yasm $options $args
diff --git a/src/crypto/isa-l/isal_crypto_accel.cc b/src/crypto/isa-l/isal_crypto_accel.cc
new file mode 100644
index 000000000..7dccf64fd
--- /dev/null
+++ b/src/crypto/isa-l/isal_crypto_accel.cc
@@ -0,0 +1,43 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Mirantis, Inc.
+ *
+ * Author: Adam Kupczyk <akupczyk@mirantis.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include "crypto/isa-l/isal_crypto_accel.h"
+
+#include "crypto/isa-l/isa-l_crypto/include/aes_cbc.h"
+
+bool ISALCryptoAccel::cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE])
+{
+ if ((size % AES_256_IVSIZE) != 0) {
+ return false;
+ }
+ alignas(16) struct cbc_key_data keys_blk;
+ aes_cbc_precomp(const_cast<unsigned char*>(&key[0]), AES_256_KEYSIZE, &keys_blk);
+ aes_cbc_enc_256(const_cast<unsigned char*>(in),
+ const_cast<unsigned char*>(&iv[0]), keys_blk.enc_keys, out, size);
+ return true;
+}
+bool ISALCryptoAccel::cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE])
+{
+ if ((size % AES_256_IVSIZE) != 0) {
+ return false;
+ }
+ alignas(16) struct cbc_key_data keys_blk;
+ aes_cbc_precomp(const_cast<unsigned char*>(&key[0]), AES_256_KEYSIZE, &keys_blk);
+ aes_cbc_dec_256(const_cast<unsigned char*>(in), const_cast<unsigned char*>(&iv[0]), keys_blk.dec_keys, out, size);
+ return true;
+}
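
The aes_cbc_precomp()/aes_cbc_enc_256()/aes_cbc_dec_256() calls used above can also be exercised stand-alone. A minimal C round-trip sketch (illustration only; it assumes the isa-l aes_cbc.h header is on the include path, the all-zero key/IV and 64-byte buffer are made up, and 32/16 are simply the AES-256 key size and CBC block size written as literals):

    #include <stdio.h>
    #include <string.h>
    #include <stdalign.h>
    #include "aes_cbc.h"

    int main(void)
    {
            unsigned char key[32] = { 0 };  /* AES-256 key, all-zero for the example */
            unsigned char iv[16] = { 0 };   /* CBC IV, one 16-byte block */
            unsigned char pt[64], ct[64], out[64];
            alignas(16) struct cbc_key_data keys;  /* expanded keys, 16-byte aligned as above */

            memset(pt, 0x11, sizeof(pt));   /* size must be a multiple of the 16-byte block */

            aes_cbc_precomp(key, sizeof(key), &keys);
            aes_cbc_enc_256(pt, iv, keys.enc_keys, ct, sizeof(pt));
            aes_cbc_dec_256(ct, iv, keys.dec_keys, out, sizeof(ct));

            printf("round trip %s\n", memcmp(pt, out, sizeof(pt)) == 0 ? "ok" : "failed");
            return 0;
    }
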
diff --git a/src/crypto/isa-l/isal_crypto_accel.h b/src/crypto/isa-l/isal_crypto_accel.h
new file mode 100644
index 000000000..84331bbdd
--- /dev/null
+++ b/src/crypto/isa-l/isal_crypto_accel.h
@@ -0,0 +1,31 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Mirantis, Inc.
+ *
+ * Author: Adam Kupczyk <akupczyk@mirantis.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef ISAL_CRYPTO_ACCEL_H
+#define ISAL_CRYPTO_ACCEL_H
+#include "crypto/crypto_accel.h"
+
+class ISALCryptoAccel : public CryptoAccel {
+ public:
+ ISALCryptoAccel() {}
+ virtual ~ISALCryptoAccel() {}
+
+ bool cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) override;
+ bool cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) override;
+};
+#endif
diff --git a/src/crypto/isa-l/isal_crypto_plugin.cc b/src/crypto/isa-l/isal_crypto_plugin.cc
new file mode 100644
index 000000000..85f0e5f0f
--- /dev/null
+++ b/src/crypto/isa-l/isal_crypto_plugin.cc
@@ -0,0 +1,34 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Mirantis, Inc.
+ *
+ * Author: Adam Kupczyk <akupczyk@mirantis.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+
+// -----------------------------------------------------------------------------
+#include "crypto/isa-l/isal_crypto_plugin.h"
+
+#include "ceph_ver.h"
+// -----------------------------------------------------------------------------
+
+const char *__ceph_plugin_version()
+{
+ return CEPH_GIT_NICE_VER;
+}
+
+int __ceph_plugin_init(CephContext *cct,
+ const std::string& type,
+ const std::string& name)
+{
+ auto instance = cct->get_plugin_registry();
+
+ return instance->add(type, name, new ISALCryptoPlugin(cct));
+}
diff --git a/src/crypto/isa-l/isal_crypto_plugin.h b/src/crypto/isa-l/isal_crypto_plugin.h
new file mode 100644
index 000000000..68e782e69
--- /dev/null
+++ b/src/crypto/isa-l/isal_crypto_plugin.h
@@ -0,0 +1,47 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Mirantis, Inc.
+ *
+ * Author: Adam Kupczyk <akupczyk@mirantis.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef ISAL_CRYPTO_PLUGIN_H
+#define ISAL_CRYPTO_PLUGIN_H
+// -----------------------------------------------------------------------------
+#include "crypto/crypto_plugin.h"
+#include "crypto/isa-l/isal_crypto_accel.h"
+#include "arch/intel.h"
+#include "arch/probe.h"
+// -----------------------------------------------------------------------------
+
+
+class ISALCryptoPlugin : public CryptoPlugin {
+
+public:
+
+ explicit ISALCryptoPlugin(CephContext* cct) : CryptoPlugin(cct)
+ {}
+ ~ISALCryptoPlugin()
+ {}
+ virtual int factory(CryptoAccelRef *cs,
+ std::ostream *ss)
+ {
+ if (cryptoaccel == nullptr)
+ {
+ ceph_arch_probe();
+ if (ceph_arch_intel_aesni && ceph_arch_intel_sse41) {
+ cryptoaccel = CryptoAccelRef(new ISALCryptoAccel);
+ }
+ }
+ *cs = cryptoaccel;
+ return 0;
+ }
+};
+#endif
diff --git a/src/crypto/openssl/CMakeLists.txt b/src/crypto/openssl/CMakeLists.txt
new file mode 100644
index 000000000..6ede1567f
--- /dev/null
+++ b/src/crypto/openssl/CMakeLists.txt
@@ -0,0 +1,14 @@
+## openssl
+
+set(openssl_crypto_plugin_srcs
+ openssl_crypto_accel.cc
+ openssl_crypto_plugin.cc)
+
+add_library(ceph_crypto_openssl SHARED ${openssl_crypto_plugin_srcs})
+target_link_libraries(ceph_crypto_openssl
+ PRIVATE OpenSSL::Crypto
+ $<$<PLATFORM_ID:Windows>:ceph-common>)
+target_include_directories(ceph_crypto_openssl PRIVATE ${OPENSSL_INCLUDE_DIR})
+add_dependencies(crypto_plugins ceph_crypto_openssl)
+set_target_properties(ceph_crypto_openssl PROPERTIES INSTALL_RPATH "")
+install(TARGETS ceph_crypto_openssl DESTINATION ${crypto_plugin_dir})
diff --git a/src/crypto/openssl/openssl_crypto_accel.cc b/src/crypto/openssl/openssl_crypto_accel.cc
new file mode 100644
index 000000000..e6ea0fa72
--- /dev/null
+++ b/src/crypto/openssl/openssl_crypto_accel.cc
@@ -0,0 +1,104 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include "crypto/openssl/openssl_crypto_accel.h"
+#include <openssl/evp.h>
+#include <openssl/engine.h>
+#include "common/debug.h"
+
+// -----------------------------------------------------------------------------
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_crypto
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+
+static std::ostream&
+_prefix(std::ostream* _dout)
+{
+ return *_dout << "OpensslCryptoAccel: ";
+}
+// -----------------------------------------------------------------------------
+
+#define EVP_SUCCESS 1
+#define AES_ENCRYPT 1
+#define AES_DECRYPT 0
+
+bool evp_transform(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char* iv,
+ const unsigned char* key,
+ ENGINE* engine,
+ const EVP_CIPHER* const type,
+ const int encrypt)
+{
+ using pctx_t = std::unique_ptr<EVP_CIPHER_CTX, decltype(&::EVP_CIPHER_CTX_free)>;
+ pctx_t pctx{ EVP_CIPHER_CTX_new(), EVP_CIPHER_CTX_free };
+
+ if (!pctx) {
+ derr << "failed to create evp cipher context" << dendl;
+ return false;
+ }
+
+ if (EVP_CipherInit_ex(pctx.get(), type, engine, key, iv, encrypt) != EVP_SUCCESS) {
+ derr << "EVP_CipherInit_ex failed" << dendl;
+ return false;
+ }
+
+ if (EVP_CIPHER_CTX_set_padding(pctx.get(), 0) != EVP_SUCCESS) {
+ derr << "failed to disable PKCS padding" << dendl;
+ return false;
+ }
+
+ int len_update = 0;
+ if (EVP_CipherUpdate(pctx.get(), out, &len_update, in, size) != EVP_SUCCESS) {
+ derr << "EVP_CipherUpdate failed" << dendl;
+ return false;
+ }
+
+ int len_final = 0;
+ if (EVP_CipherFinal_ex(pctx.get(), out + len_update, &len_final) != EVP_SUCCESS) {
+ derr << "EVP_CipherFinal_ex failed" << dendl;
+ return false;
+ }
+
+ ceph_assert(len_final == 0);
+ return (len_update + len_final) == static_cast<int>(size);
+}
+
+bool OpenSSLCryptoAccel::cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE])
+{
+ if ((size % AES_256_IVSIZE) != 0) {
+ return false;
+ }
+
+ return evp_transform(out, in, size, const_cast<unsigned char*>(&iv[0]),
+ const_cast<unsigned char*>(&key[0]),
+ nullptr, // Hardware acceleration engine can be used in the future
+ EVP_aes_256_cbc(), AES_ENCRYPT);
+}
+
+bool OpenSSLCryptoAccel::cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE])
+{
+ if ((size % AES_256_IVSIZE) != 0) {
+ return false;
+ }
+
+ return evp_transform(out, in, size, const_cast<unsigned char*>(&iv[0]),
+ const_cast<unsigned char*>(&key[0]),
+ nullptr, // Hardware acceleration engine can be used in the future
+ EVP_aes_256_cbc(), AES_DECRYPT);
+}
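
The evp_transform() helper above maps onto a short stand-alone program, which is handy for sanity-checking the round trip outside Ceph. A minimal sketch of the same EVP_CipherInit_ex/EVP_CipherUpdate/EVP_CipherFinal_ex sequence with padding disabled (illustration only; the zero key/IV and 32-byte buffer are made up):

    #include <stdio.h>
    #include <string.h>
    #include <openssl/evp.h>

    static int transform(unsigned char *out, const unsigned char *in, int size,
                         const unsigned char *iv, const unsigned char *key, int enc)
    {
            EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
            int len = 0, fin = 0, ok;

            ok = ctx != NULL
                && EVP_CipherInit_ex(ctx, EVP_aes_256_cbc(), NULL, key, iv, enc) == 1
                && EVP_CIPHER_CTX_set_padding(ctx, 0) == 1  /* buffers are block-aligned */
                && EVP_CipherUpdate(ctx, out, &len, in, size) == 1
                && EVP_CipherFinal_ex(ctx, out + len, &fin) == 1;
            EVP_CIPHER_CTX_free(ctx);
            return ok && (len + fin) == size;
    }

    int main(void)
    {
            unsigned char key[32] = { 0 }, iv[16] = { 0 };
            unsigned char pt[32], ct[32], rt[32];

            memset(pt, 0x22, sizeof(pt));
            if (!transform(ct, pt, sizeof(pt), iv, key, 1) ||  /* encrypt */
                !transform(rt, ct, sizeof(ct), iv, key, 0))    /* decrypt */
                    return 1;
            printf("round trip %s\n", memcmp(pt, rt, sizeof(pt)) == 0 ? "ok" : "failed");
            return 0;
    }
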
diff --git a/src/crypto/openssl/openssl_crypto_accel.h b/src/crypto/openssl/openssl_crypto_accel.h
new file mode 100644
index 000000000..ad90cbece
--- /dev/null
+++ b/src/crypto/openssl/openssl_crypto_accel.h
@@ -0,0 +1,32 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef OPENSSL_CRYPTO_ACCEL_H
+#define OPENSSL_CRYPTO_ACCEL_H
+
+#include "crypto/crypto_accel.h"
+
+class OpenSSLCryptoAccel : public CryptoAccel {
+ public:
+ OpenSSLCryptoAccel() {}
+ virtual ~OpenSSLCryptoAccel() {}
+
+ bool cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) override;
+ bool cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) override;
+};
+#endif
diff --git a/src/crypto/openssl/openssl_crypto_plugin.cc b/src/crypto/openssl/openssl_crypto_plugin.cc
new file mode 100644
index 000000000..e6ecea2fd
--- /dev/null
+++ b/src/crypto/openssl/openssl_crypto_plugin.cc
@@ -0,0 +1,32 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+
+#include "crypto/openssl/openssl_crypto_plugin.h"
+
+#include "ceph_ver.h"
+
+const char *__ceph_plugin_version()
+{
+ return CEPH_GIT_NICE_VER;
+}
+
+int __ceph_plugin_init(CephContext *cct,
+ const std::string& type,
+ const std::string& name)
+{
+ auto instance = cct->get_plugin_registry();
+
+ return instance->add(type, name, new OpenSSLCryptoPlugin(cct));
+}
diff --git a/src/crypto/openssl/openssl_crypto_plugin.h b/src/crypto/openssl/openssl_crypto_plugin.h
new file mode 100644
index 000000000..408d9ebda
--- /dev/null
+++ b/src/crypto/openssl/openssl_crypto_plugin.h
@@ -0,0 +1,36 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef OPENSSL_CRYPTO_PLUGIN_H
+#define OPENSSL_CRYPTO_PLUGIN_H
+
+#include "crypto/crypto_plugin.h"
+#include "crypto/openssl/openssl_crypto_accel.h"
+
+
+class OpenSSLCryptoPlugin : public CryptoPlugin {
+
+ CryptoAccelRef cryptoaccel;
+public:
+ explicit OpenSSLCryptoPlugin(CephContext* cct) : CryptoPlugin(cct)
+ {}
+ int factory(CryptoAccelRef *cs, std::ostream *ss) override {
+ if (cryptoaccel == nullptr)
+ cryptoaccel = CryptoAccelRef(new OpenSSLCryptoAccel);
+
+ *cs = cryptoaccel;
+ return 0;
+ }
+};
+#endif
diff --git a/src/crypto/qat/CMakeLists.txt b/src/crypto/qat/CMakeLists.txt
new file mode 100644
index 000000000..fb751967a
--- /dev/null
+++ b/src/crypto/qat/CMakeLists.txt
@@ -0,0 +1,20 @@
+##
+# QAT wrapper for Ceph
+##
+
+set(qat_crypto_plugin_srcs
+ qat_crypto_accel.cc
+ qat_crypto_plugin.cc
+ qcccrypto.cc)
+
+add_library(ceph_crypto_qat SHARED ${qat_crypto_plugin_srcs})
+
+add_dependencies(crypto_plugins ceph_crypto_qat)
+
+target_link_libraries(ceph_crypto_qat PRIVATE
+ QatDrv::qat_s
+ QatDrv::usdm_drv_s)
+
+add_dependencies(crypto_plugins ceph_crypto_qat)
+set_target_properties(ceph_crypto_qat PROPERTIES VERSION 1.0.0 SOVERSION 1)
+install(TARGETS ceph_crypto_qat DESTINATION ${crypto_plugin_dir})
diff --git a/src/crypto/qat/qat_crypto_accel.cc b/src/crypto/qat/qat_crypto_accel.cc
new file mode 100644
index 000000000..23f86edfa
--- /dev/null
+++ b/src/crypto/qat/qat_crypto_accel.cc
@@ -0,0 +1,42 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ * Author: Ganesh Mahalingam <ganesh.mahalingam@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include "crypto/qat/qat_crypto_accel.h"
+
+bool QccCryptoAccel::cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE])
+{
+ if ((size % AES_256_IVSIZE) != 0) {
+ return false;
+ }
+
+ return qcccrypto.perform_op(out, in, size,
+ const_cast<unsigned char *>(&iv[0]),
+ const_cast<unsigned char *>(&key[0]), CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT);
+}
+
+bool QccCryptoAccel::cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE])
+{
+ if ((size % AES_256_IVSIZE) != 0) {
+ return false;
+ }
+
+ return qcccrypto.perform_op(out, in, size,
+ const_cast<unsigned char *>(&iv[0]),
+ const_cast<unsigned char *>(&key[0]), CPA_CY_SYM_CIPHER_DIRECTION_DECRYPT);
+}
diff --git a/src/crypto/qat/qat_crypto_accel.h b/src/crypto/qat/qat_crypto_accel.h
new file mode 100644
index 000000000..5badefc28
--- /dev/null
+++ b/src/crypto/qat/qat_crypto_accel.h
@@ -0,0 +1,35 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ * Author: Ganesh Mahalingam <ganesh.mahalingam@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef QAT_CRYPTO_ACCEL_H
+#define QAT_CRYPTO_ACCEL_H
+
+#include "crypto/crypto_accel.h"
+#include "crypto/qat/qcccrypto.h"
+
+class QccCryptoAccel : public CryptoAccel {
+ public:
+ QccCrypto qcccrypto;
+ QccCryptoAccel() { qcccrypto.init(); };
+ ~QccCryptoAccel() { qcccrypto.destroy(); };
+
+ bool cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) override;
+ bool cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) override;
+};
+#endif
diff --git a/src/crypto/qat/qat_crypto_plugin.cc b/src/crypto/qat/qat_crypto_plugin.cc
new file mode 100644
index 000000000..4bf3d61bb
--- /dev/null
+++ b/src/crypto/qat/qat_crypto_plugin.cc
@@ -0,0 +1,35 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ * Author: Ganesh Mahalingam <ganesh.mahalingam@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+
+#include "crypto/qat/qat_crypto_plugin.h"
+
+#include "ceph_ver.h"
+
+std::mutex QccCryptoPlugin::qat_init;
+
+const char *__ceph_plugin_version()
+{
+ return CEPH_GIT_NICE_VER;
+}
+
+int __ceph_plugin_init(CephContext *cct,
+ const std::string& type,
+ const std::string& name)
+{
+ PluginRegistry *instance = cct->get_plugin_registry();
+
+ return instance->add(type, name, new QccCryptoPlugin(cct));
+}
diff --git a/src/crypto/qat/qat_crypto_plugin.h b/src/crypto/qat/qat_crypto_plugin.h
new file mode 100644
index 000000000..a8d4df7cb
--- /dev/null
+++ b/src/crypto/qat/qat_crypto_plugin.h
@@ -0,0 +1,42 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ * Author: Ganesh Mahalingam <ganesh.mahalingam@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef QAT_CRYPTO_PLUGIN_H
+#define QAT_CRYPTO_PLUGIN_H
+
+#include "crypto/crypto_plugin.h"
+#include "crypto/qat/qat_crypto_accel.h"
+
+
+class QccCryptoPlugin : public CryptoPlugin {
+ static std::mutex qat_init;
+
+public:
+
+ explicit QccCryptoPlugin(CephContext* cct) : CryptoPlugin(cct)
+ {}
+ ~QccCryptoPlugin()
+ {}
+ virtual int factory(CryptoAccelRef *cs, std::ostream *ss)
+ {
+ std::lock_guard<std::mutex> l(qat_init);
+ if (cryptoaccel == nullptr)
+ cryptoaccel = CryptoAccelRef(new QccCryptoAccel);
+
+ *cs = cryptoaccel;
+ return 0;
+ }
+};
+#endif
diff --git a/src/crypto/qat/qcccrypto.cc b/src/crypto/qat/qcccrypto.cc
new file mode 100644
index 000000000..a3f253726
--- /dev/null
+++ b/src/crypto/qat/qcccrypto.cc
@@ -0,0 +1,471 @@
+#include "qcccrypto.h"
+#include <iostream>
+#include "string.h"
+#include <pthread.h>
+#include "common/debug.h"
+#include "include/scope_guard.h"
+#include "common/dout.h"
+#include "common/errno.h"
+
+// -----------------------------------------------------------------------------
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+
+static std::ostream& _prefix(std::ostream* _dout)
+{
+ return *_dout << "QccCrypto: ";
+}
+// -----------------------------------------------------------------------------
+
+/*
+ * Poller thread & functions
+*/
+static std::mutex qcc_alloc_mutex;
+static std::mutex qcc_eng_mutex;
+static std::atomic<bool> init_called = { false };
+
+void* QccCrypto::crypt_thread(void *args) {
+ struct qcc_thread_args *thread_args = (struct qcc_thread_args *)args;
+ thread_args->qccinstance->do_crypt(thread_args);
+ return thread_args;
+}
+
+void QccCrypto::QccFreeInstance(int entry) {
+ std::lock_guard<std::mutex> freeinst(qcc_alloc_mutex);
+ open_instances.push(entry);
+}
+
+int QccCrypto::QccGetFreeInstance() {
+ int ret = -1;
+ std::lock_guard<std::mutex> getinst(qcc_alloc_mutex);
+ if (!open_instances.empty()) {
+ ret = open_instances.front();
+ open_instances.pop();
+ }
+ return ret;
+}
+
+void QccCrypto::cleanup() {
+ icp_sal_userStop();
+ qaeMemDestroy();
+ is_init = false;
+ init_stat = stat;
+ init_called = false;
+ derr << "Failure during QAT init sequence. Quitting" << dendl;
+}
+
+/*
+ * We initialize the QAT instance and everything that is common to all ops
+*/
+bool QccCrypto::init()
+{
+
+ std::lock_guard<std::mutex> l(qcc_eng_mutex);
+
+ if(init_called) {
+ dout(10) << "Init sequence already called. Skipping duplicate call" << dendl;
+ return true;
+ }
+
+ // First call to init
+ dout(15) << "First init for QAT" << dendl;
+ init_called = true;
+
+  // Find out if the usermode memory driver is available. We need it to
+  // create the contiguous memory needed by QAT.
+ stat = qaeMemInit();
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to load memory driver" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ stat = icp_sal_userStart("CEPH");
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to start qat device" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ qcc_os_mem_alloc((void **)&qcc_inst, sizeof(QCCINST));
+ if(qcc_inst == NULL) {
+ derr << "Unable to alloc mem for instance struct" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ // Initialize contents of qcc_inst
+ qcc_inst->num_instances = 0;
+ qcc_inst->cy_inst_handles = NULL;
+
+ stat = cpaCyGetNumInstances(&(qcc_inst->num_instances));
+ if ((stat != CPA_STATUS_SUCCESS) || (qcc_inst->num_instances <= 0)) {
+ derr << "Unable to find available instances" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ qcc_os_mem_alloc((void **)&qcc_inst->cy_inst_handles,
+ ((int)qcc_inst->num_instances * sizeof(CpaInstanceHandle)));
+ if (qcc_inst->cy_inst_handles == NULL) {
+ derr << "Unable to allocate instances array memory" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ stat = cpaCyGetInstances(qcc_inst->num_instances, qcc_inst->cy_inst_handles);
+ if (stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to get instances" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ int iter = 0;
+ //Start Instances
+ for(iter = 0; iter < qcc_inst->num_instances; iter++) {
+ stat = cpaCyStartInstance(qcc_inst->cy_inst_handles[iter]);
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to start instance" << dendl;
+ this->cleanup();
+ return false;
+ }
+ }
+
+ qcc_os_mem_alloc((void **)&qcc_inst->is_polled,
+ ((int)qcc_inst->num_instances * sizeof(CpaBoolean)));
+ CpaInstanceInfo2 info;
+ for(iter = 0; iter < qcc_inst->num_instances; iter++) {
+ qcc_inst->is_polled[iter] = cpaCyInstanceGetInfo2(qcc_inst->cy_inst_handles[iter],
+ &info) == CPA_STATUS_SUCCESS ? info.isPolled : CPA_FALSE;
+ }
+
+ // Allocate memory structures for all instances
+ qcc_os_mem_alloc((void **)&qcc_sess,
+ ((int)qcc_inst->num_instances * sizeof(QCCSESS)));
+ if(qcc_sess == NULL) {
+ derr << "Unable to allocate memory for session struct" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ qcc_os_mem_alloc((void **)&qcc_op_mem,
+ ((int)qcc_inst->num_instances * sizeof(QCCOPMEM)));
+  if(qcc_op_mem == NULL) {
+ derr << "Unable to allocate memory for opmem struct" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ qcc_os_mem_alloc((void **)&cypollthreads,
+ ((int)qcc_inst->num_instances * sizeof(pthread_t)));
+ if(cypollthreads == NULL) {
+ derr << "Unable to allocate memory for pthreads" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+  // At this point we are only doing a user-space version.
+  // To-Do: Maybe a kernel-based one
+ for(iter = 0; iter < qcc_inst->num_instances; iter++) {
+ stat = cpaCySetAddressTranslation(qcc_inst->cy_inst_handles[iter],
+ qaeVirtToPhysNUMA);
+ if(stat == CPA_STATUS_SUCCESS) {
+ // Start HW Polling Thread
+ // To-Do: Enable epoll & interrupt based later?
+ // QccCyStartPoll(iter);
+ // Setup the session structures for crypto operation and populate
+ // whatever we can now. Rest will be filled in when crypto operation
+ // happens.
+ qcc_sess[iter].sess_ctx_sz = 0;
+ qcc_sess[iter].sess_ctx = NULL;
+ qcc_sess[iter].sess_stp_data.sessionPriority = CPA_CY_PRIORITY_NORMAL;
+ qcc_sess[iter].sess_stp_data.symOperation = CPA_CY_SYM_OP_CIPHER;
+ open_instances.push(iter);
+ qcc_op_mem[iter].is_mem_alloc = false;
+ qcc_op_mem[iter].op_complete = false;
+ qcc_op_mem[iter].op_result = CPA_STATUS_SUCCESS;
+ qcc_op_mem[iter].sym_op_data = NULL;
+ qcc_op_mem[iter].buff_meta_size = qcc_op_mem[iter].buff_size = 0;
+ qcc_op_mem[iter].src_buff_meta = qcc_op_mem[iter].src_buff
+ = qcc_op_mem[iter].iv_buff = NULL;
+ qcc_op_mem[iter].src_buff_list = NULL;
+ qcc_op_mem[iter].src_buff_flat = NULL;
+ qcc_op_mem[iter].num_buffers = 1;
+ } else {
+ derr << "Unable to find address translations of instance " << iter << dendl;
+ this->cleanup();
+ return false;
+ }
+ }
+ is_init = true;
+ dout(10) << "Init complete" << dendl;
+ return true;
+}
+
+bool QccCrypto::destroy() {
+ if((!is_init) || (!init_called)) {
+ dout(15) << "QAT not initialized here. Nothing to do" << dendl;
+ return false;
+ }
+
+ unsigned int retry = 0;
+ while(retry <= QCC_MAX_RETRIES) {
+ if(open_instances.size() == qcc_inst->num_instances) {
+ break;
+ } else {
+ retry++;
+ }
+ dout(5) << "QAT is still busy and cannot free resources yet" << dendl;
+ return false;
+ }
+
+ dout(10) << "Destroying QAT crypto & related memory" << dendl;
+ int iter = 0;
+
+ // Free up op related memory
+ for (iter =0; iter < qcc_inst->num_instances; iter++) {
+ qcc_contig_mem_free((void **)&(qcc_op_mem[iter].src_buff));
+ qcc_contig_mem_free((void **)&(qcc_op_mem[iter].iv_buff));
+ qcc_os_mem_free((void **)&(qcc_op_mem[iter].src_buff_list));
+ qcc_os_mem_free((void **)&(qcc_op_mem[iter].src_buff_flat));
+ qcc_contig_mem_free((void **)&(qcc_op_mem[iter].sym_op_data));
+ }
+
+ // Free up Session memory
+ for(iter = 0; iter < qcc_inst->num_instances; iter++) {
+ cpaCySymRemoveSession(qcc_inst->cy_inst_handles[iter], qcc_sess[iter].sess_ctx);
+ qcc_contig_mem_free((void **)&(qcc_sess[iter].sess_ctx));
+ }
+
+ // Stop QAT Instances
+ for(iter = 0; iter < qcc_inst->num_instances; iter++) {
+ cpaCyStopInstance(qcc_inst->cy_inst_handles[iter]);
+ }
+
+ // Free up the base structures we use
+ qcc_os_mem_free((void **)&qcc_op_mem);
+ qcc_os_mem_free((void **)&qcc_sess);
+ qcc_os_mem_free((void **)&(qcc_inst->cy_inst_handles));
+ qcc_os_mem_free((void **)&(qcc_inst->is_polled));
+ qcc_os_mem_free((void **)&cypollthreads);
+ qcc_os_mem_free((void **)&qcc_inst);
+
+ //Un-init memory driver and QAT HW
+ icp_sal_userStop();
+ qaeMemDestroy();
+ init_called = false;
+ is_init = false;
+ return true;
+}
+
+void QccCrypto::do_crypt(qcc_thread_args *thread_args) {
+ auto entry = thread_args->entry;
+ qcc_op_mem[entry].op_result = cpaCySymPerformOp(qcc_inst->cy_inst_handles[entry],
+ NULL,
+ qcc_op_mem[entry].sym_op_data,
+ qcc_op_mem[entry].src_buff_list,
+ qcc_op_mem[entry].src_buff_list,
+ NULL);
+ qcc_op_mem[entry].op_complete = true;
+ free(thread_args);
+}
+
+bool QccCrypto::perform_op(unsigned char* out, const unsigned char* in,
+ size_t size, uint8_t *iv, uint8_t *key, CpaCySymCipherDirection op_type)
+{
+ if (!init_called) {
+    dout(10) << "QAT not initialized yet. Initializing now..." << dendl;
+ if(!QccCrypto::init()) {
+ derr << "QAT init failed" << dendl;
+ return false;
+ }
+ }
+
+ if(!is_init)
+ {
+ dout(10) << "QAT not initialized in this instance or init failed with possible error " << (int)init_stat << dendl;
+ return is_init;
+ }
+
+ int avail_inst = -1;
+ unsigned int retrycount = 0;
+ while(retrycount <= QCC_MAX_RETRIES) {
+ avail_inst = QccGetFreeInstance();
+ if(avail_inst != -1) {
+ break;
+ } else {
+ retrycount++;
+ usleep(qcc_sleep_duration);
+ }
+ }
+
+ if(avail_inst == -1) {
+    derr << "Unable to get a QAT instance. Failing request" << dendl;
+ return false;
+ }
+
+ dout(15) << "Using inst " << avail_inst << dendl;
+ // Start polling threads for this instance
+ //QccCyStartPoll(avail_inst);
+
+ auto sg = make_scope_guard([=] {
+ //free up the instance irrespective of the op status
+ dout(15) << "Completed task under " << avail_inst << dendl;
+ qcc_op_mem[avail_inst].op_complete = false;
+ QccCrypto::QccFreeInstance(avail_inst);
+ });
+
+ /*
+   * Allocate buffers for this instance if not already done.
+   * Hold on to most of them until the destructor is called.
+ */
+ if (qcc_op_mem[avail_inst].is_mem_alloc == false) {
+
+ qcc_sess[avail_inst].sess_stp_data.cipherSetupData.cipherAlgorithm =
+ CPA_CY_SYM_CIPHER_AES_CBC;
+ qcc_sess[avail_inst].sess_stp_data.cipherSetupData.cipherKeyLenInBytes =
+ AES_256_KEY_SIZE;
+
+ // Allocate contig memory for buffers that are independent of the
+ // input/output
+ stat = cpaCyBufferListGetMetaSize(qcc_inst->cy_inst_handles[avail_inst],
+ qcc_op_mem[avail_inst].num_buffers, &(qcc_op_mem[avail_inst].buff_meta_size));
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to get buff meta size" << dendl;
+ return false;
+ }
+
+ // Allocate Buffer List Private metadata
+ stat = qcc_contig_mem_alloc((void **)&(qcc_op_mem[avail_inst].src_buff_meta),
+ qcc_op_mem[avail_inst].buff_meta_size, 1);
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to allocate private metadata memory" << dendl;
+ return false;
+ }
+
+ // Allocate Buffer List Memory
+ qcc_os_mem_alloc((void **)&(qcc_op_mem[avail_inst].src_buff_list), sizeof(CpaBufferList));
+ qcc_os_mem_alloc((void **)&(qcc_op_mem[avail_inst].src_buff_flat),
+ (qcc_op_mem[avail_inst].num_buffers * sizeof(CpaFlatBuffer)));
+ if(qcc_op_mem[avail_inst].src_buff_list == NULL || qcc_op_mem[avail_inst].src_buff_flat == NULL) {
+ derr << "Unable to allocate bufferlist memory" << dendl;
+ return false;
+ }
+
+ // Allocate IV memory
+ stat = qcc_contig_mem_alloc((void **)&(qcc_op_mem[avail_inst].iv_buff), AES_256_IV_LEN);
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to allocate bufferlist memory" << dendl;
+ return false;
+ }
+
+    // Assign source buffers for the operation
+ (qcc_op_mem[avail_inst].src_buff_list)->pBuffers = qcc_op_mem[avail_inst].src_buff_flat;
+ (qcc_op_mem[avail_inst].src_buff_list)->numBuffers = qcc_op_mem[avail_inst].num_buffers;
+ (qcc_op_mem[avail_inst].src_buff_list)->pPrivateMetaData = qcc_op_mem[avail_inst].src_buff_meta;
+
+ //Setup OpData
+ stat = qcc_contig_mem_alloc((void **)&(qcc_op_mem[avail_inst].sym_op_data),
+ sizeof(CpaCySymOpData));
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to allocate opdata memory" << dendl;
+ return false;
+ }
+
+ // Assume encryption while creating the session; the actual cipher
+ // direction is set again on every call below.
+ qcc_sess[avail_inst].sess_stp_data.cipherSetupData.cipherDirection =
+ CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT;
+ // Allocate Session memory
+ stat = cpaCySymSessionCtxGetSize(qcc_inst->cy_inst_handles[avail_inst],
+ &(qcc_sess[avail_inst].sess_stp_data), &(qcc_sess[avail_inst].sess_ctx_sz));
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to find session size" << dendl;
+ return false;
+ }
+
+ stat = qcc_contig_mem_alloc((void **)&(qcc_sess[avail_inst].sess_ctx),
+ qcc_sess[avail_inst].sess_ctx_sz);
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to allocate contig memory" << dendl;
+ return false;
+ }
+
+ // Set memalloc flag so that we don't go through this exercise again.
+ qcc_op_mem[avail_inst].is_mem_alloc = true;
+ dout(15) << "Instantiation complete for " << avail_inst << dendl;
+ }
+
+ // Section that runs on every call
+ // Identify the operation and assign to session
+ qcc_sess[avail_inst].sess_stp_data.cipherSetupData.cipherDirection = op_type;
+ qcc_sess[avail_inst].sess_stp_data.cipherSetupData.pCipherKey = (Cpa8U *)key;
+
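+ // (Re)initialize the session so the key and direction set above take
+ // effect for this operation.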
+ stat = cpaCySymInitSession(qcc_inst->cy_inst_handles[avail_inst],
+ NULL,
+ &(qcc_sess[avail_inst].sess_stp_data),
+ qcc_sess[avail_inst].sess_ctx);
+ if (stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to init session" << dendl;
+ return false;
+ }
+
+ // (Re)allocate the contiguous data buffer whenever the request size changes
+ if (qcc_op_mem[avail_inst].buff_size != (Cpa32U)size) {
+ qcc_contig_mem_free((void **)&(qcc_op_mem[avail_inst].src_buff));
+ qcc_op_mem[avail_inst].buff_size = (Cpa32U)size;
+ stat = qcc_contig_mem_alloc((void **)&(qcc_op_mem[avail_inst].src_buff),
+ qcc_op_mem[avail_inst].buff_size);
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to allocate contig memory" << dendl;
+ return false;
+ }
+ }
+
+ // Copy src & iv into the respective buffers
+ memcpy(qcc_op_mem[avail_inst].src_buff, in, size);
+ memcpy(qcc_op_mem[avail_inst].iv_buff, iv, AES_256_IV_LEN);
+
+ // Assign the remaining flat-buffer fields
+ qcc_op_mem[avail_inst].src_buff_flat->dataLenInBytes = qcc_op_mem[avail_inst].buff_size;
+ qcc_op_mem[avail_inst].src_buff_flat->pData = qcc_op_mem[avail_inst].src_buff;
+
+ //OpData assignment
+ qcc_op_mem[avail_inst].sym_op_data->sessionCtx = qcc_sess[avail_inst].sess_ctx;
+ qcc_op_mem[avail_inst].sym_op_data->packetType = CPA_CY_SYM_PACKET_TYPE_FULL;
+ qcc_op_mem[avail_inst].sym_op_data->pIv = qcc_op_mem[avail_inst].iv_buff;
+ qcc_op_mem[avail_inst].sym_op_data->ivLenInBytes = AES_256_IV_LEN;
+ qcc_op_mem[avail_inst].sym_op_data->cryptoStartSrcOffsetInBytes = 0;
+ qcc_op_mem[avail_inst].sym_op_data->messageLenToCipherInBytes = qcc_op_mem[avail_inst].buff_size;
+
+ // Perform cipher operation in a thread
+ qcc_thread_args* thread_args = new qcc_thread_args();
+ thread_args->qccinstance = this;
+ thread_args->entry = avail_inst;
+
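+ // While the worker thread performs the op, poll the instance here (for
+ // polled instances) until the worker marks it complete, then join.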
+ if (pthread_create(&cypollthreads[avail_inst], NULL, crypt_thread, (void *)thread_args) != 0) {
+ derr << "Unable to create thread for crypt operation" << dendl;
+ return false;
+ }
+ if (qcc_inst->is_polled[avail_inst] == CPA_TRUE) {
+ while (!qcc_op_mem[avail_inst].op_complete) {
+ icp_sal_CyPollInstance(qcc_inst->cy_inst_handles[avail_inst], 0);
+ }
+ }
+ pthread_join(cypollthreads[avail_inst], NULL);
+
+ if(qcc_op_mem[avail_inst].op_result != CPA_STATUS_SUCCESS) {
+ derr << "Unable to perform crypt operation" << dendl;
+ return false;
+ }
+
+ //Copy data back to out buffer
+ memcpy(out, qcc_op_mem[avail_inst].src_buff, size);
+ // Always scrub the buffers holding user data before returning
+ memset(qcc_op_mem[avail_inst].iv_buff, 0, AES_256_IV_LEN);
+ memset(qcc_op_mem[avail_inst].src_buff, 0, qcc_op_mem[avail_inst].buff_size);
+
+ return true;
+}
diff --git a/src/crypto/qat/qcccrypto.h b/src/crypto/qat/qcccrypto.h
new file mode 100644
index 000000000..a36b0898b
--- /dev/null
+++ b/src/crypto/qat/qcccrypto.h
@@ -0,0 +1,176 @@
+#ifndef QCCCRYPTO_H
+#define QCCCRYPTO_H
+
+#include <atomic>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <queue>
+extern "C" {
+#include "cpa.h"
+#include "lac/cpa_cy_sym.h"
+#include "lac/cpa_cy_im.h"
+#include "qae_mem.h"
+#include "icp_sal_user.h"
+#include "icp_sal_poll.h"
+#include "qae_mem_utils.h"
+}
+
+class QccCrypto {
+
+ public:
+ CpaCySymCipherDirection qcc_op_type;
+
+ QccCrypto() {};
+ ~QccCrypto() {};
+
+ bool init();
+ bool destroy();
+ bool perform_op(unsigned char* out, const unsigned char* in, size_t size,
+ uint8_t *iv,
+ uint8_t *key,
+ CpaCySymCipherDirection op_type);
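+ // Illustrative usage (caller-provided 256-bit key and 16-byte IV assumed):
+ //   QccCrypto qcc;
+ //   qcc.perform_op(out, in, 4096, iv, key,
+ //                  CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT);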
+
+ private:
+
+ // Currently only supporting AES_256_CBC.
+ // To-Do: Needs to be expanded
+ static const size_t AES_256_IV_LEN = 16;
+ static const size_t AES_256_KEY_SIZE = 32;
+ static const size_t QCC_MAX_RETRIES = 5000;
+
+ /*
+ * Struct holding the QAT instances that handle the crypto operations. These
+ * are identified at start-up and held until the destructor is called.
+ * To-Do:
+ * The struct was created assuming that we will use all the instances.
+ * Expand the current implementation to allow multiple instances to operate
+ * independently.
+ */
+ struct QCCINST {
+ CpaInstanceHandle *cy_inst_handles;
+ CpaBoolean *is_polled;
+ Cpa16U num_instances;
+ } *qcc_inst;
+
+ /*
+ * QAT Crypto Session
+ * The session context and setup data hold the priority, the type of crypto
+ * operation (cipher/chained), the cipher algorithm (AES, DES, etc.), and
+ * whether single-buffer or multi-buffer crypto is used.
+ */
+ struct QCCSESS {
+ CpaCySymSessionSetupData sess_stp_data;
+ Cpa32U sess_ctx_sz;
+ CpaCySymSessionCtx sess_ctx;
+ } *qcc_sess;
+
+ /*
+ * Cipher Memory Allocations
+ * Holds the buffer list, flat buffer, cipher operation data and buffer
+ * metadata needed by QAT to perform the operation, plus buffers for the
+ * IV and the source/destination data.
+ */
+ struct QCCOPMEM {
+ // Op common items
+ bool is_mem_alloc;
+ bool op_complete;
+ CpaStatus op_result;
+ CpaCySymOpData *sym_op_data;
+ Cpa32U buff_meta_size;
+ Cpa32U num_buffers;
+ Cpa32U buff_size;
+
+ //Src data items
+ Cpa8U *src_buff_meta;
+ CpaBufferList *src_buff_list;
+ CpaFlatBuffer *src_buff_flat;
+ Cpa8U *src_buff;
+ Cpa8U *iv_buff;
+ } *qcc_op_mem;
+
+ //QAT HW polling thread input structure
+ struct qcc_thread_args {
+ QccCrypto* qccinstance;
+ int entry;
+ };
+
+
+ /*
+ * Function to handle the crypt operation. Will run while the main thread
+ * runs the polling function on the instance doing the op
+ */
+ void do_crypt(qcc_thread_args *thread_args);
+
+ /*
+ * Queue of free QAT instance indices available to service an op
+ */
+ std::queue<int> open_instances;
+ int QccGetFreeInstance();
+ void QccFreeInstance(int entry);
+
+ /*
+ * Contiguous memory allocator and de-allocator. We use the usdm driver
+ * that ships with QAT to obtain DMA-able memory backed by hugepages.
+ * To-Do: a kernel-based allocator.
+ */
+ static inline void qcc_contig_mem_free(void **ptr) {
+ if (*ptr) {
+ qaeMemFreeNUMA(ptr);
+ *ptr = NULL;
+ }
+ }
+
+ static inline CpaStatus qcc_contig_mem_alloc(void **ptr, Cpa32U size, Cpa32U alignment = 1) {
+ *ptr = qaeMemAllocNUMA(size, 0, alignment);
+ if (NULL == *ptr)
+ {
+ return CPA_STATUS_RESOURCE;
+ }
+ return CPA_STATUS_SUCCESS;
+ }
+
+ /*
+ * malloc & free calls wrapped to keep allocations consistent and to allow
+ * a future kernel-based allocator.
+ */
+ static inline void qcc_os_mem_free(void **ptr) {
+ if (*ptr) {
+ free(*ptr);
+ *ptr = NULL;
+ }
+ }
+
+ static inline CpaStatus qcc_os_mem_alloc(void **ptr, Cpa32U size) {
+ *ptr = malloc(size);
+ if (*ptr == NULL)
+ {
+ return CPA_STATUS_RESOURCE;
+ }
+ return CPA_STATUS_SUCCESS;
+ }
+
+ std::atomic<bool> is_init = { false };
+ CpaStatus init_stat, stat;
+
+ /*
+ * Function to clean up memory if the constructor fails
+ */
+ void cleanup();
+
+ /*
+ * Crypto Polling Function & helpers
+ * These retrieve responses from the QAT rings and dispatch the associated
+ * callbacks. For synchronous operation (like this one), the QAT library
+ * creates an internal callback for the operation.
+ */
+ static void* crypt_thread(void* entry);
+ CpaStatus QccCyStartPoll(int entry);
+ void poll_instance(int entry);
+
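+ // One polling/worker thread slot per QAT instance; qcc_sleep_duration is
+ // the usleep() interval (in microseconds) between instance-acquisition
+ // retries.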
+ pthread_t *cypollthreads;
+ static const size_t qcc_sleep_duration = 2;
+};
+#endif //QCCCRYPTO_H